Diffstat (limited to 'llvm/test')
414 files changed, 16560 insertions, 9157 deletions
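Every hunk below follows the same mechanical pattern: the masked load/store (and gather/scatter) intrinsics no longer take an explicit i32 alignment operand, and the alignment is instead carried as an align parameter attribute on the pointer (or vector-of-pointers) operand. A minimal before/after sketch in LLVM IR, with illustrative value names based on the first hunk:

    ; before: alignment passed as a separate i32 operand
    %a = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %p, i32 16, <8 x i1> %m, <8 x i16> %passthru)
    call void @llvm.masked.store.v8i16.p0(<8 x i16> %a, ptr %q, i32 16, <8 x i1> %m)

    ; after: alignment expressed as an attribute on the pointer operand
    %a = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 %p, <8 x i1> %m, <8 x i16> %passthru)
    call void @llvm.masked.store.v8i16.p0(<8 x i16> %a, ptr align 16 %q, <8 x i1> %m)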
diff --git a/llvm/test/Analysis/BasicAA/intrinsics.ll b/llvm/test/Analysis/BasicAA/intrinsics.ll index 56d762b..52dd65d 100644 --- a/llvm/test/Analysis/BasicAA/intrinsics.ll +++ b/llvm/test/Analysis/BasicAA/intrinsics.ll @@ -10,8 +10,8 @@ define <8 x i16> @test0(ptr noalias %p, ptr noalias %q, <8 x i16> %y, <8 x i1> % ; CHECK-LABEL: define <8 x i16> @test0( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], <8 x i16> [[Y:%.*]], <8 x i1> [[M:%.*]], <8 x i16> [[PT:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[A:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[P]], i32 16, <8 x i1> [[M]], <8 x i16> [[PT]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[Y]], ptr [[Q]], i32 16, <8 x i1> [[M]]) +; CHECK-NEXT: [[A:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[P]], <8 x i1> [[M]], <8 x i16> [[PT]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[Y]], ptr align 16 [[Q]], <8 x i1> [[M]]) ; CHECK-NEXT: [[C:%.*]] = add <8 x i16> [[A]], [[A]] ; CHECK-NEXT: ret <8 x i16> [[C]] ; diff --git a/llvm/test/Analysis/BasicAA/ptr-vector.ll b/llvm/test/Analysis/BasicAA/ptr-vector.ll index 7dea24f..598c170 100644 --- a/llvm/test/Analysis/BasicAA/ptr-vector.ll +++ b/llvm/test/Analysis/BasicAA/ptr-vector.ll @@ -1,8 +1,8 @@ ; RUN: opt -print-all-alias-modref-info -passes=aa-eval -disable-output < %s 2>&1 | FileCheck %s ; CHECK: MayAlias: i8* %b, i8* %p -; CHECK: Just Ref: Ptr: i8* %p <-> %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %a, i32 8, <1 x i1> %c, <1 x ptr> poison) -; CHECK: Just Ref: Ptr: i8* %b <-> %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %a, i32 8, <1 x i1> %c, <1 x ptr> poison) +; CHECK: Just Ref: Ptr: i8* %p <-> %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr align 8 %a, <1 x i1> %c, <1 x ptr> poison) +; CHECK: Just Ref: Ptr: i8* %b <-> %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr align 8 %a, <1 x i1> %c, <1 x ptr> poison) define void @test(ptr %a, ptr %b, <1 x i1> %c) { %v1p = call <1 x ptr> @llvm.masked.load.v1p0.p0(ptr %a, i32 8, <1 x i1> %c, <1 x ptr> poison) %p = bitcast <1 x ptr> %v1p to ptr diff --git a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll index ffd8259..5cf0ae9 100644 --- a/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll +++ b/llvm/test/Analysis/BasicAA/scalable-dse-aa.ll @@ -10,12 +10,12 @@ define <vscale x 4 x float> @dead_scalable_store(ptr %0) { ; CHECK-NEXT: [[GEP_0_48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48 ; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16 ; CHECK-NEXT: [[GEP_ARR_48:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 48 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_48]], ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> 
[[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_16]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr align 1 [[GEP_ARR_16]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_48]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_48]], ptr align 1 [[GEP_ARR_48]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_ARR_16]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_ARR_48]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[FADD:%.*]] = fadd <vscale x 4 x float> [[FADDOP0]], [[FADDOP1]] ; CHECK-NEXT: ret <vscale x 4 x float> [[FADD]] ; @@ -54,12 +54,12 @@ define <4 x float> @dead_scalable_store_fixed(ptr %0) { ; CHECK-NEXT: [[GEP_0_48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 48 ; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16 ; CHECK-NEXT: [[GEP_ARR_48:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 48 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <4 x i1> [[MASK]]) -; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_0_48]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0_48]], ptr nonnull [[GEP_ARR_48]], i32 1, <4 x i1> [[MASK]]) -; CHECK-NEXT: [[FADDOP0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) -; CHECK-NEXT: [[FADDOP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[GEP_ARR_48]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_0_16]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0_16]], ptr align 1 [[GEP_ARR_16]], <4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_0_48]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0_48]], ptr align 1 [[GEP_ARR_48]], <4 x i1> [[MASK]]) +; CHECK-NEXT: [[FADDOP0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_ARR_16]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[FADDOP1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_ARR_48]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; CHECK-NEXT: [[FADD:%.*]] = fadd <4 x float> [[FADDOP0]], 
[[FADDOP1]] ; CHECK-NEXT: ret <4 x float> [[FADD]] ; @@ -101,14 +101,14 @@ define <vscale x 4 x float> @scalable_store_partial_overwrite(ptr %0) { ; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 16 ; CHECK-NEXT: [[GEP_ARR_30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 30 ; CHECK-NEXT: [[GEP_ARR_48:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 48 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[LOAD_0_30:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_30]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_30]], ptr nonnull [[GEP_ARR_30]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_48]], ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_48]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_16]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr align 1 [[GEP_ARR_16]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD_0_30:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_30]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_30]], ptr align 1 [[GEP_ARR_30]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD_0_48:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_48]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_48]], ptr align 1 [[GEP_ARR_48]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_ARR_16]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_ARR_48]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[FADD:%.*]] = fadd <vscale x 4 x float> [[FADDOP0]], [[FADDOP1]] ; CHECK-NEXT: ret <vscale x 4 x float> [[FADD]] ; @@ -147,13 +147,13 @@ define <vscale x 4 x float> @dead_scalable_store_small_mask(ptr %0) { ; CHECK-NEXT: [[GEP_0_46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 46 ; CHECK-NEXT: [[GEP_ARR_16:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 
16 ; CHECK-NEXT: [[GEP_ARR_46:%.*]] = getelementptr inbounds nuw i8, ptr [[ARR]], i64 46 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[LOAD_0_46:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_0_46]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_46]], ptr nonnull [[GEP_ARR_46]], i32 1, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_16]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_16]], ptr align 1 [[GEP_ARR_16]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD_0_46:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_46]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0_46]], ptr align 1 [[GEP_ARR_46]], <vscale x 4 x i1> [[MASK]]) ; CHECK-NEXT: [[SMALLMASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 2) -; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_16]], i32 1, <vscale x 4 x i1> [[SMALLMASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[GEP_ARR_46]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FADDOP0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_ARR_16]], <vscale x 4 x i1> [[SMALLMASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[FADDOP1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_ARR_46]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[FADD:%.*]] = fadd <vscale x 4 x float> [[FADDOP0]], [[FADDOP1]] ; CHECK-NEXT: ret <vscale x 4 x float> [[FADD]] ; @@ -188,9 +188,9 @@ define <vscale x 4 x float> @dead_scalar_store(ptr noalias %0, ptr %1) { ; CHECK-LABEL: define <vscale x 4 x float> @dead_scalar_store( ; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4) -; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr align 1 [[TMP1]], <vscale x 4 x 
i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 0, i128 4) @@ -210,9 +210,9 @@ define <4 x float> @dead_scalable_store_fixed_large_mask(ptr noalias %0, ptr %1) ; CHECK-NEXT: [[MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7) ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i64 5 ; CHECK-NEXT: store i32 50, ptr [[GEP_5]], align 4 -; CHECK-NEXT: [[LOAD_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[TMP0]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <4 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr nonnull [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP0]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD_0]], ptr align 1 [[TMP1]], <4 x i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP1]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; CHECK-NEXT: ret <4 x float> [[RETVAL]] ; %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 7) @@ -237,9 +237,9 @@ define <vscale x 4 x float> @mask_gt_minimum_num_elts(ptr noalias %0, ptr %1) { ; CHECK-NEXT: store i32 10, ptr [[GEP_1_12]], align 4 ; CHECK-NEXT: [[GEP_1_28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 28 ; CHECK-NEXT: store i32 20, ptr [[GEP_1_28]], align 4 -; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) @@ -260,9 +260,9 @@ define <vscale x 4 x float> @active_lane_mask_lt(ptr noalias %0, ptr %1) { ; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2) ; CHECK-NEXT: store i32 20, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void 
@llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 4, i32 2) @@ -280,9 +280,9 @@ define <vscale x 4 x float> @active_lane_mask_eq(ptr noalias %0, ptr %1) { ; CHECK-SAME: ptr noalias [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2) ; CHECK-NEXT: store i32 20, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 2, i32 2) @@ -300,9 +300,9 @@ define <vscale x 16 x i8> @scalar_stores_small_mask(ptr noalias %0, ptr %1) { ; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i8(i8 0, i8 7) ; CHECK-NEXT: [[GEP_1_8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 8 ; CHECK-NEXT: store i8 120, ptr [[GEP_1_8]], align 1 -; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 16 x i1> [[MASK]], <vscale x 16 x i8> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[LOAD_0]], ptr [[TMP1]], i32 1, <vscale x 16 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP1]], i32 1, <vscale x 16 x i1> [[MASK]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP0]], <vscale x 16 x i1> [[MASK]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[LOAD_0]], ptr align 1 [[TMP1]], <vscale x 16 x i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 16 x i8> 
@llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP1]], <vscale x 16 x i1> [[MASK]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: ret <vscale x 16 x i8> [[RETVAL]] ; %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i8.i8(i8 0, i8 7) @@ -323,9 +323,9 @@ define <vscale x 4 x float> @dead_scalar_store_offset(ptr noalias %0, ptr %1) { ; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4) ; CHECK-NEXT: [[GEP_1_12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 ; CHECK-NEXT: store i32 40, ptr [[GEP_1_12]], align 4 -; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP0]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr nonnull [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD_0]], ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[RETVAL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 4 x float> [[RETVAL]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 1, i32 4) diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll index 56ae1ac..b4ced24 100644 --- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll +++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll @@ -5,24 +5,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fixed() { ; CHECK-LABEL: 'fixed' -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:32 Lat:48 SizeLat:48 for: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:80 CodeSize:64 Lat:96 SizeLat:96 for: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:32 Lat:48 SizeLat:48 for: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i32 = call <2 x i32> 
@llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:15 Lat:22 SizeLat:22 for: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:31 Lat:46 SizeLat:46 for: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:15 Lat:22 SizeLat:22 for: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:152 CodeSize:124 Lat:184 SizeLat:184 for: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 8 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 8 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:32 Lat:48 SizeLat:48 for: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 8 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:80 CodeSize:64 Lat:96 SizeLat:96 for: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 8 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 8 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 8 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:32 Lat:48 SizeLat:48 for: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 8 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 8 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 
SizeLat:24 for: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 8 undef, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:8 Lat:12 SizeLat:12 for: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 8 undef, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 8 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:15 Lat:22 SizeLat:22 for: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 8 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:38 CodeSize:31 Lat:46 SizeLat:46 for: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 8 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 8 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:15 Lat:22 SizeLat:22 for: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 8 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:10 SizeLat:10 for: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 8 undef, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:16 Lat:24 SizeLat:24 for: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 undef, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:152 CodeSize:124 Lat:184 SizeLat:184 for: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr align 8 undef, <32 x i1> undef, <32 x half> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: @@ -56,26 +56,26 @@ entry: define void @scalable() { ; CHECK-LABEL: 'scalable' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i32 = call <vscale x 4 x i32> 
@llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of Invalid for: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr undef, i32 16, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 8 undef, <vscale x 2 x 
i1> undef, <vscale x 2 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr align 8 undef, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %nxv4i1 = call <vscale x 4 x i1> @llvm.masked.load.nxv4i1.p0(ptr align 16 undef, <vscale x 4 x i1> undef, <vscale x 4 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: @@ -113,69 +113,69 @@ entry: define void @scalable_ext_loads() { ; CHECK-LABEL: 'scalable_ext_loads' -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv16i8to16 = zext <vscale x 16 x i8> %load.nxv16i8 to <vscale x 16 x i16> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv16i8.2 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv16i8.2 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv16i8to32 = zext <vscale x 16 x i8> %load.nxv16i8.2 to <vscale x 16 x i32> 
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv16i8.3 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv16i8.3 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv16i8to64 = zext <vscale x 16 x i8> %load.nxv16i8.3 to <vscale x 16 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv8i8to16 = zext <vscale x 8 x i8> %load.nxv8i8 to <vscale x 8 x i16> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv4i8to32 = zext <vscale x 4 x i8> %load.nxv4i8 to <vscale x 4 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv2i8to64 
= zext <vscale x 2 x i8> %load.nxv2i8 to <vscale x 2 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv8i16to32 = zext <vscale x 8 x i16> %load.nxv8i16 to <vscale x 8 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i16.2 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv8i16.2 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv8i16to64 = zext <vscale x 8 x i16> %load.nxv8i16.2 to <vscale x 8 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv4i16to32 = zext <vscale x 4 x i16> %load.nxv4i16 to <vscale x 4 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv2i16to64 = zext <vscale x 2 x i16> %load.nxv2i16 to <vscale x 2 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv4i32to64 = zext <vscale x 4 x i32> %load.nxv4i32 to <vscale x 4 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load.nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 8 undef, <vscale 
x 2 x i1> undef, <vscale x 2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv2i32to64 = zext <vscale x 2 x i32> %load.nxv2i32 to <vscale x 2 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv16i8to16 = sext <vscale x 16 x i8> %load2.nxv16i8 to <vscale x 16 x i16> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv16i8.2 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv16i8.2 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv16i8to32 = sext <vscale x 16 x i8> %load2.nxv16i8.2 to <vscale x 16 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv16i8.3 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv16i8.3 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv16i8to64 = sext <vscale x 16 x i8> %load2.nxv16i8.3 to <vscale x 16 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv8i8to16 = sext <vscale x 8 x i8> %load2.nxv8i8 to <vscale x 8 x i16> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i8.2 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i8.3 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> 
undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv4i8to32 = sext <vscale x 4 x i8> %load2.nxv4i8 to <vscale x 4 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i8.2 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv2i8to64 = sext <vscale x 2 x i8> %load2.nxv2i8 to <vscale x 2 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv8i16to32 = sext <vscale x 8 x i16> %load2.nxv8i16 to <vscale x 8 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i16.2 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv8i16.2 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv8i16to64 = sext <vscale x 8 x i16> %load2.nxv8i16.2 to <vscale x 8 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv4i16to32 = sext <vscale x 4 x i16> %load2.nxv4i16 to <vscale x 4 x i32> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i16.2 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv2i16 = call 
<vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv2i16to64 = sext <vscale x 2 x i16> %load2.nxv2i16 to <vscale x 2 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv4i32to64 = sext <vscale x 4 x i32> %load2.nxv4i32 to <vscale x 4 x i64> -; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 1 for: %load2.nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv2i32to64 = sext <vscale x 2 x i32> %load2.nxv2i32 to <vscale x 2 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll index 0519454..fa53a18 100644 --- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll +++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll @@ -19,13 +19,13 @@ target triple = "aarch64-unknown-linux-gnu" define void @fixed_sve_vls() #0 { ; CHECK-LABEL: 'fixed_sve_vls' -; CHECK: Cost Model: Found an estimated cost of [[#div(2047,VBITS)+1]] for instruction: %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0(ptr undef, i32 8, <256 x i1> undef, <256 x i8> undef) -; CHECK: Cost Model: Found an estimated cost of [[#div(4091,VBITS)+1]] for instruction: %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0(ptr undef, i32 8, <256 x i1> undef, <256 x i16> undef) -; CHECK: Cost Model: Found an estimated cost of [[#div(511,VBITS)+1]] for instruction: %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i32> undef) -; CHECK: Cost Model: Found an estimated cost of [[#div(1023,VBITS)+1]] for instruction: %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i64> undef) -; CHECK: Cost Model: Found an estimated cost of [[#div(8191,VBITS)+1]] for instruction: %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0(ptr undef, i32 8, <512 x i1> undef, <512 x half> undef) -; CHECK: Cost Model: Found an estimated cost of [[#div(8191,VBITS)+1]] for instruction: %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0(ptr undef, i32 8, <256 x i1> undef, <256 x float> undef) -; CHECK: Cost Model: Found an estimated cost of [[#div(8191,VBITS)+1]] for instruction: %v128f64 = call <128 x double> @llvm.masked.load.v128f64.p0(ptr undef, i32 8, <128 x i1> undef, <128 x double> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(2047,VBITS)+1]] for instruction: %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0(ptr align 8 undef, <256 x 
i1> undef, <256 x i8> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(4091,VBITS)+1]] for instruction: %v256i16 = call <256 x i16> @llvm.masked.load.v256i16.p0(ptr align 8 undef, <256 x i1> undef, <256 x i16> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(511,VBITS)+1]] for instruction: %v16i32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 8 undef, <16 x i1> undef, <16 x i32> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(1023,VBITS)+1]] for instruction: %v16i64 = call <16 x i64> @llvm.masked.load.v16i64.p0(ptr align 8 undef, <16 x i1> undef, <16 x i64> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(8191,VBITS)+1]] for instruction: %v512f16 = call <512 x half> @llvm.masked.load.v512f16.p0(ptr align 8 undef, <512 x i1> undef, <512 x half> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(8191,VBITS)+1]] for instruction: %v256f32 = call <256 x float> @llvm.masked.load.v256f32.p0(ptr align 8 undef, <256 x i1> undef, <256 x float> undef) +; CHECK: Cost Model: Found an estimated cost of [[#div(8191,VBITS)+1]] for instruction: %v128f64 = call <128 x double> @llvm.masked.load.v128f64.p0(ptr align 8 undef, <128 x i1> undef, <128 x double> undef) ; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret void entry: %v256i8 = call <256 x i8> @llvm.masked.load.v256i8.p0(ptr undef, i32 8, <256 x i1> undef, <256 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll index ae638e5..d031ac6 100644 --- a/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll +++ b/llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll @@ -190,19 +190,19 @@ declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) { ; CHECK: gather_load_4xi8_constant_mask ; CHECK-NEON-LABEL: 'gather_load_4xi8_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i8> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i8> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i8> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_constant_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i8> undef) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i8> undef) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 
CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; ; CHECK-SVE-512-LABEL: 'gather_load_4xi8_constant_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i8> undef) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i8> undef) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef) @@ -212,19 +212,19 @@ define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) { define <4 x i8> @gather_load_4xi8_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: gather_load_4xi8_variable_mask ; CHECK-NEON-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i8> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i8> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i8> undef) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; ; CHECK-SVE-512-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i8> undef) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %lv ; %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) @@ -235,19 +235,19 @@ declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32 immarg, <4 define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) { ; CHECK: scatter_store_4xi8_constant_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi8_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x 
ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_constant_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'scatter_store_4xi8_constant_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) @@ -257,19 +257,19 @@ define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) { define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi8_variable_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-256-NEXT: Cost Model: 
Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) @@ -280,19 +280,19 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32 immarg, <4 x i1> define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) { ; CHECK: gather_load_4xi32_constant_mask ; CHECK-NEON-LABEL: 'gather_load_4xi32_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_constant_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; ; CHECK-SVE-512-LABEL: 'gather_load_4xi32_constant_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef) @@ -302,19 +302,19 @@ define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) { define <4 x i32> @gather_load_4xi32_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: 
gather_load_4xi32_variable_mask ; CHECK-NEON-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i32> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i32> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i32> undef) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; ; CHECK-SVE-512-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %ptrs, <4 x i1> %cond, <4 x i32> undef) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %lv ; %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) @@ -325,19 +325,19 @@ declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32 immarg, < define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs) { ; CHECK: scatter_store_4xi32_constant_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi32_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:20 CodeSize:12 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-SVE-128-NEXT: Cost Model: 
Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_constant_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'scatter_store_4xi32_constant_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 true)) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> splat (i1 true)) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) @@ -347,19 +347,19 @@ define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs) define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi32_variable_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> align 1 %ptrs, <4 x i1> %cond) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) @@ -370,19 
+370,19 @@ declare <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr>, i32, <256 x define void @sve_gather_vls(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_scatter_vls' ; CHECK-NEON-LABEL: 'sve_gather_vls' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> align 2 undef, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'sve_gather_vls' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> align 2 undef, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'sve_gather_vls' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 2560 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 2560 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> align 2 undef, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'sve_gather_vls' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 2560 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 2560 for: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> align 2 undef, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: @@ -394,19 +394,19 @@ declare <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr>, i32, <256 define void @sve_gather_vls_float(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_gather_vls_float' ; CHECK-NEON-LABEL: 'sve_gather_vls_float' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1664 CodeSize:1216 Lat:1920 SizeLat:1920 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1664 CodeSize:1216 Lat:1920 SizeLat:1920 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> align 4 undef, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'sve_gather_vls_float' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:1664 CodeSize:1216 Lat:1920 SizeLat:1920 for: %res.v256f32 = 
call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:1664 CodeSize:1216 Lat:1920 SizeLat:1920 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> align 4 undef, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'sve_gather_vls_float' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 2560 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 2560 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> align 4 undef, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'sve_gather_vls_float' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 2560 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 2560 for: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> align 4 undef, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: @@ -418,19 +418,19 @@ declare void @llvm.masked.scatter.v256i8.v256p0(<256 x i8>, <256 x ptr>, i32, <2 define void @sve_scatter_vls(<256 x i1> %v256i1mask){ ; CHECK-LABEL: 'sve_scatter_vls' ; CHECK-NEON-LABEL: 'sve_scatter_vls' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> align 1 undef, <256 x i1> %v256i1mask) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'sve_scatter_vls' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:1792 CodeSize:1280 Lat:2048 SizeLat:2048 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> align 1 undef, <256 x i1> %v256i1mask) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'sve_scatter_vls' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 2560 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 2560 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> align 1 undef, <256 x i1> %v256i1mask) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'sve_scatter_vls' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 2560 for: call void 
@llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 2560 for: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> align 1 undef, <256 x i1> %v256i1mask) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: @@ -442,19 +442,19 @@ declare void @llvm.masked.scatter.v512f16.v512p0(<512 x half>, <512 x ptr>, i32, define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){ ; CHECK-LABEL: 'sve_scatter_vls_float' ; CHECK-NEON-LABEL: 'sve_scatter_vls_float' -; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:3456 CodeSize:2496 Lat:3968 SizeLat:3968 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:3456 CodeSize:2496 Lat:3968 SizeLat:3968 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> align 2 undef, <512 x i1> %v512i1mask) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-128-LABEL: 'sve_scatter_vls_float' -; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:3456 CodeSize:2496 Lat:3968 SizeLat:3968 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:3456 CodeSize:2496 Lat:3968 SizeLat:3968 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> align 2 undef, <512 x i1> %v512i1mask) ; CHECK-SVE-128-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-256-LABEL: 'sve_scatter_vls_float' -; CHECK-SVE-256-NEXT: Cost Model: Found costs of 5120 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-SVE-256-NEXT: Cost Model: Found costs of 5120 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> align 2 undef, <512 x i1> %v512i1mask) ; CHECK-SVE-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-SVE-512-LABEL: 'sve_scatter_vls_float' -; CHECK-SVE-512-NEXT: Cost Model: Found costs of 5120 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-SVE-512-NEXT: Cost Model: Found costs of 5120 for: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> align 2 undef, <512 x i1> %v512i1mask) ; CHECK-SVE-512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll index 35b9875..6e0b1ec 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-gather.ll @@ -10,17 +10,17 @@ target triple="aarch64--linux-gnu" define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #0 { ; CHECK-VSCALE-1-LABEL: 'masked_gathers' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x 
ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) -; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) -; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_gathers' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) -; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) -; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x 
i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = call <vscale x 4 x i1> @llvm.masked.gather.nxv4i1.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i1> zeroinitializer) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) @@ -32,15 +32,15 @@ define void @masked_gathers(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nx define void @masked_gathers_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #1 { ; CHECK-VSCALE-1-LABEL: 'masked_gathers_tune_generic' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) -; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_gathers_tune_generic' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) -; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x 
ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask, <vscale x 8 x i32> zeroinitializer) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask, <vscale x 1 x i64> zeroinitializer) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %res.nxv4i32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask, <vscale x 4 x i32> zeroinitializer) @@ -51,25 +51,25 @@ define void @masked_gathers_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale define void @masked_gathers_no_vscale_range() #2 { ; CHECK-VSCALE-1-LABEL: 'masked_gathers_no_vscale_range' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an 
estimated cost of 80 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_gathers_no_vscale_range' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; 
CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv2f64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8f32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4f32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res.nxv2f32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %res.nxv16i16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %res.nxv8i16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %res.nxv4i16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %res.nxv4f64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) @@ -88,11 +88,11 @@ define void @masked_gathers_no_vscale_range() #2 { define <2 x i128> @masked_gather_v1i128(<2 x ptr> %ld, <2 x i1> %masks, <2 x i128> %passthru) #3 { ; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> align 16 %ld, <2 x i1> %masks, <2 x i128> %passthru) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res ; ; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> %ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> align 16 %ld, <2 x i1> %masks, <2 x i128> %passthru) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %res ; %res = call <2 x i128> @llvm.masked.gather.v2i128.v2p0(<2 x ptr> 
%ld, i32 0, <2 x i1> %masks, <2 x i128> %passthru) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll index 2c838e2..20e5dbc4 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-illegal-types.ll @@ -22,8 +22,8 @@ define void @load_store(ptr %ptrs) { define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) { ; CHECK-LABEL: 'masked_load_store' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr %ptrs, i32 8, <vscale x 1 x i1> %mask) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128.p0(ptr align 8 %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.store.nxv1i128.p0(<vscale x 1 x i128> %mload, ptr align 8 %ptrs, <vscale x 1 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mload = call <vscale x 1 x i128> @llvm.masked.load.nxv1i128(ptr %val, i32 8, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) @@ -33,8 +33,8 @@ define void @masked_load_store(ptr %ptrs, ptr %val, <vscale x 1 x i1> %mask, <vs define void @masked_gather_scatter(<vscale x 1 x ptr> %ptrs, <vscale x 1 x ptr> %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) { ; CHECK-LABEL: 'masked_gather_scatter' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> %ptrs, i32 0, <vscale x 1 x i1> %mask) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128.nxv1p0(<vscale x 1 x ptr> align 16 %val, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i128.nxv1p0(<vscale x 1 x i128> %mgather, <vscale x 1 x ptr> align 16 %ptrs, <vscale x 1 x i1> %mask) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mgather = call <vscale x 1 x i128> @llvm.masked.gather.nxv1i128(<vscale x 1 x ptr> %val, i32 0, <vscale x 1 x i1> %mask, <vscale x 1 x i128> %passthru) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll index 0976a10..e007800 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -1069,15 +1069,15 @@ define void @fshl() #0 { define <vscale x 4 x i32> @masked_gather_nxv4i32(<vscale x 4 x ptr> %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) { ; CHECK-VSCALE-1-LABEL: 'masked_gather_nxv4i32' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 40 for: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) +; 
CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 40 for: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 4 x i32> %res ; ; CHECK-VSCALE-2-LABEL: 'masked_gather_nxv4i32' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 80 for: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 80 for: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 4 x i32> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_nxv4i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 4 x i32> %res ; %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x ptr> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) @@ -1086,15 +1086,15 @@ define <vscale x 4 x i32> @masked_gather_nxv4i32(<vscale x 4 x ptr> %ld, <vscale define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x ptr> %ld, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) { ; CHECK-VSCALE-1-LABEL: 'masked_gather_nxv8i32' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 80 for: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 80 for: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 %ld, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 8 x i32> %res ; ; CHECK-VSCALE-2-LABEL: 'masked_gather_nxv8i32' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 160 for: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 160 for: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 %ld, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 8 x i32> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_nxv8i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 %ld, <vscale x 8 x i1> 
%masks, <vscale x 8 x i32> %passthru) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <vscale x 8 x i32> %res ; %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x ptr> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) @@ -1103,15 +1103,15 @@ define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x ptr> %ld, <vscale define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> %passthru) { ; CHECK-VSCALE-1-LABEL: 'masked_gather_v4i32' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ld, <4 x i1> %masks, <4 x i32> %passthru) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; CHECK-VSCALE-2-LABEL: 'masked_gather_v4i32' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ld, <4 x i1> %masks, <4 x i32> %passthru) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v4i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:36 CodeSize:20 Lat:36 SizeLat:36 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:36 CodeSize:20 Lat:36 SizeLat:36 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ld, <4 x i1> %masks, <4 x i32> %passthru) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) @@ -1120,15 +1120,15 @@ define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i128> %passthru) { ; CHECK-VSCALE-1-LABEL: 'masked_gather_v1i128' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> align 16 %ld, <1 x i1> %masks, <1 x i128> %passthru) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <1 x i128> %res ; ; CHECK-VSCALE-2-LABEL: 'masked_gather_v1i128' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> align 16 %ld, <1 x i1> %masks, 
<1 x i128> %passthru) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <1 x i128> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v1i128' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> align 16 %ld, <1 x i1> %masks, <1 x i128> %passthru) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <1 x i128> %res ; %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) @@ -1137,15 +1137,15 @@ define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i12 define void @masked_scatter_nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, <vscale x 4 x i1> %masks) { ; CHECK-VSCALE-1-LABEL: 'masked_scatter_nxv4i32' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, i32 0, <vscale x 4 x i1> %masks) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 40 for: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %data, <vscale x 4 x ptr> align 4 %ptrs, <vscale x 4 x i1> %masks) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatter_nxv4i32' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, i32 0, <vscale x 4 x i1> %masks) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %data, <vscale x 4 x ptr> align 4 %ptrs, <vscale x 4 x i1> %masks) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_nxv4i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %data, <vscale x 4 x ptr> %ptrs, i32 0, <vscale x 4 x i1> %masks) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %data, <vscale x 4 x ptr> align 4 %ptrs, <vscale x 4 x i1> %masks) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; @@ -1155,15 +1155,15 @@ define void @masked_scatter_nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x ptr> define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x ptr> %ptrs, <vscale x 8 x i1> %masks) { ; CHECK-VSCALE-1-LABEL: 'masked_scatter_nxv8i32' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> %data, <vscale x 8 x ptr> %ptrs, i32 0, <vscale x 8 x i1> %masks) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> %data, <vscale x 8 x ptr> align 4 %ptrs, <vscale x 8 x i1> %masks) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatter_nxv8i32' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 160 for: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> %data, <vscale x 8 x ptr> %ptrs, i32 
0, <vscale x 8 x i1> %masks) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 160 for: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> %data, <vscale x 8 x ptr> align 4 %ptrs, <vscale x 8 x i1> %masks) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_nxv8i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> %data, <vscale x 8 x ptr> %ptrs, i32 0, <vscale x 8 x i1> %masks) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of Invalid for: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> %data, <vscale x 8 x ptr> align 4 %ptrs, <vscale x 8 x i1> %masks) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; @@ -1173,15 +1173,15 @@ define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x ptr> define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %masks) { ; CHECK-VSCALE-1-LABEL: 'masked_scatter_v4i32' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> align 4 %ptrs, <4 x i1> %masks) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatter_v4i32' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:20 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> align 4 %ptrs, <4 x i1> %masks) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v4i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:28 CodeSize:16 Lat:28 SizeLat:28 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:28 CodeSize:16 Lat:28 SizeLat:28 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> align 4 %ptrs, <4 x i1> %masks) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; @@ -1191,15 +1191,15 @@ define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %ma define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %masks) { ; CHECK-VSCALE-1-LABEL: 'masked_scatter_v1i128' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> align 16 %ptrs, <1 x i1> %masks) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatter_v1i128' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: call 
void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:6 Lat:8 SizeLat:9 for: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> align 16 %ptrs, <1 x i1> %masks) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v1i128' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> align 16 %ptrs, <1 x i1> %masks) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll index b9defdf..0bb38de 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-scatter.ll @@ -10,24 +10,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #0 { ; CHECK-VSCALE-1-LABEL: 'masked_scatters' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask) -; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask) -; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> %nxv4i1mask) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatters' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for 
instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask) -; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask) -; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> %nxv4i1mask) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-V2-LABEL: 'masked_scatters' -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask) -; CHECK-V2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask) -; CHECK-V2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask) +; CHECK-V2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask) +; CHECK-V2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i1.nxv4p0(<vscale x 4 x i1> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> %nxv4i1mask) ; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) @@ -39,21 +39,21 @@ define void @masked_scatters(<vscale x 4 x i1> %nxv4i1mask, <vscale x 8 x i1> %n define void @masked_scatters_tune_generic(<vscale x 4 x i1> %nxv4i1mask, 
<vscale x 8 x i1> %nxv8i1mask, <4 x i1> %v4i1mask, <1 x i1> %v1i1mask, <vscale x 1 x i1> %nxv1i1mask) #1 { ; CHECK-VSCALE-1-LABEL: 'masked_scatters_tune_generic' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask) -; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask) +; CHECK-VSCALE-1-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatters_tune_generic' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask) -; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask) +; CHECK-VSCALE-2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-V2-LABEL: 'masked_scatters_tune_generic' -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 0, <vscale x 8 x i1> %nxv8i1mask) -; CHECK-V2-NEXT: Cost Model: Invalid cost for instruction: call void 
@llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 0, <vscale x 1 x i1> %nxv1i1mask) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> %nxv4i1mask) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> %nxv8i1mask) +; CHECK-V2-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> %nxv1i1mask) ; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 0, <vscale x 4 x i1> %nxv4i1mask) @@ -64,36 +64,36 @@ define void @masked_scatters_tune_generic(<vscale x 4 x i1> %nxv4i1mask, <vscale define void @masked_scatters_no_vscale_range() #2 { ; CHECK-VSCALE-1-LABEL: 'masked_scatters_no_vscale_range' -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; 
CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-VSCALE-2-LABEL: 'masked_scatters_no_vscale_range' -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 
x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-V2-LABEL: 'masked_scatters_no_vscale_range' -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 208 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> align 1 
undef, <vscale x 8 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 208 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) ; CHECK-V2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll index 1570fc7..c2248c2 100644 --- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll +++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll @@ -1081,15 +1081,15 @@ define i32 @load_fptrunc() { define i32 @maskedload_extends() { ; CHECK-NEON-LABEL: 'maskedload_extends' -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> 
undef, <8 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 undef, <4 x i1> undef, <4 x i32> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2816s = sext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2816u = zext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v2832s = sext <2 x i8> %loadv2i8 to <2 x i32> @@ -1133,15 +1133,15 @@ define i32 @maskedload_extends() { ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-MVE-LABEL: 'maskedload_extends' -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: 
%loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 undef, <4 x i1> undef, <4 x i32> undef) ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %v2816s = sext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2816u = zext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %v2832s = sext <2 x i8> %loadv2i8 to <2 x i32> @@ -1185,15 +1185,15 @@ define i32 @maskedload_extends() { ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-V8M-MAIN-LABEL: 'maskedload_extends' -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr 
align 1 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 undef, <4 x i1> undef, <4 x i32> undef) ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2816s = sext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2816u = zext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2832s = sext <2 x i8> %loadv2i8 to <2 x i32> @@ -1237,15 +1237,15 @@ define i32 @maskedload_extends() { ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8M-BASE-LABEL: 'maskedload_extends' -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 
undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 undef, <4 x i1> undef, <4 x i32> undef) ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2816s = sext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2816u = zext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v2832s = sext <2 x i8> %loadv2i8 to <2 x i32> @@ -1289,15 +1289,15 @@ define i32 @maskedload_extends() { ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8R-LABEL: 'maskedload_extends' -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x i32> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-V8R-NEXT: Cost 
Model: Found costs of 64 for: %loadv8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: %loadv16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 undef, <4 x i1> undef, <4 x i32> undef) ; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %v2816s = sext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %v2816u = zext <2 x i8> %loadv2i8 to <2 x i16> ; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %v2832s = sext <2 x i8> %loadv2i8 to <2 x i32> @@ -1418,26 +1418,26 @@ define i32 @maskedstore_trunc() { ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %v81664 = trunc <8 x i64> undef to <8 x i16> ; CHECK-NEON-NEXT: Cost Model: Found costs of 1 for: %v23264 = trunc <2 x i64> undef to <2 x i32> ; CHECK-NEON-NEXT: Cost Model: Found costs of 0 for: %v43264 = trunc <4 x i64> undef to <4 x i32> -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr undef, i32 1, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr undef, i32 1, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr undef, i32 1, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr undef, i32 1, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr undef, i32 1, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr undef, i32 1, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr undef, i32 1, <8 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr undef, i32 1, <8 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr undef, i32 1, <8 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr undef, i32 1, <16 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr undef, i32 1, <16 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr undef, i32 1, <16 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found 
costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr align 1 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr align 1 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr align 1 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr align 1 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr align 1 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr align 1 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr align 1 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr align 1 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr align 1 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr align 1 undef, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr align 1 undef, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr align 1 undef, <16 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void 
@llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-MVE-LABEL: 'maskedstore_trunc' @@ -1461,26 +1461,26 @@ define i32 @maskedstore_trunc() { ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %v81664 = trunc <8 x i64> undef to <8 x i16> ; CHECK-MVE-NEXT: Cost Model: Found costs of 0 for: %v23264 = trunc <2 x i64> undef to <2 x i32> ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v43264 = trunc <4 x i64> undef to <4 x i32> -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr undef, i32 1, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr undef, i32 1, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr undef, i32 1, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr undef, i32 1, <4 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr undef, i32 1, <4 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr undef, i32 1, <4 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr undef, i32 1, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr undef, i32 1, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr undef, i32 1, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr undef, i32 1, <16 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr undef, i32 1, <16 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr undef, i32 1, <16 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr undef, i32 2, <4 x i1> undef) -; 
CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr align 1 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr align 1 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr align 1 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr align 1 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr align 1 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr align 1 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr align 1 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr align 1 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr align 1 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr align 1 undef, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr align 1 undef, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr align 1 undef, <16 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found 
costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-V8M-MAIN-LABEL: 'maskedstore_trunc' @@ -1504,26 +1504,26 @@ define i32 @maskedstore_trunc() { ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 0 for: %v81664 = trunc <8 x i64> undef to <8 x i16> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 0 for: %v23264 = trunc <2 x i64> undef to <2 x i32> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 0 for: %v43264 = trunc <4 x i64> undef to <4 x i32> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: 
Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr align 4 
undef, <4 x i1> undef) ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8M-BASE-LABEL: 'maskedstore_trunc' @@ -1547,26 +1547,26 @@ define i32 @maskedstore_trunc() { ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 0 for: %v81664 = trunc <8 x i64> undef to <8 x i16> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 0 for: %v23264 = trunc <2 x i64> undef to <2 x i32> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 0 for: %v43264 = trunc <4 x i64> undef to <4 x i32> -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void 
@llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8R-LABEL: 'maskedstore_trunc' @@ -1590,26 +1590,26 @@ define i32 @maskedstore_trunc() { ; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %v81664 = trunc <8 x i64> undef to <8 x i16> ; CHECK-V8R-NEXT: Cost Model: Found costs of 1 for: %v23264 = trunc <2 x i64> undef to <2 x i32> ; CHECK-V8R-NEXT: Cost Model: Found costs of 0 for: %v43264 = trunc <4 x i64> undef to <4 x i32> -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr undef, i32 1, <2 x 
i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr undef, i32 1, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr undef, i32 1, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr undef, i32 1, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr undef, i32 1, <16 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2816, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2832, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i8.p0(<2 x i8> %v2864, ptr align 1 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4816, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i8.p0(<4 x i8> %v4832, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void 
@llvm.masked.store.v4i8.p0(<4 x i8> %v4864, ptr align 1 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8816, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8832, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> %v8864, ptr align 1 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16816, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16832, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> %v16864, ptr align 1 undef, <16 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i16.p0(<2 x i16> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %v2816 = trunc <2 x i16> undef to <2 x i8> @@ -1661,13 +1661,13 @@ define i32 @maskedstore_trunc() { define i32 @maskedload_fpextends() { ; CHECK-NEON-LABEL: 'maskedload_fpextends' -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 4, <8 x i1> undef, <8 x float> undef) 
+; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr align 2 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 undef, <8 x i1> undef, <8 x float> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of 20 for: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> ; CHECK-NEON-NEXT: Cost Model: Found costs of 40 for: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> ; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> @@ -1684,13 +1684,13 @@ define i32 @maskedload_fpextends() { ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-MVE-LABEL: 'maskedload_fpextends' -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 4, <8 x i1> undef, <8 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr align 2 undef, <16 x i1> undef, 
<16 x half> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 undef, <8 x i1> undef, <8 x float> undef) ; CHECK-MVE-NEXT: Cost Model: Found costs of 2 for: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> @@ -1707,13 +1707,13 @@ define i32 @maskedload_fpextends() { ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-V8M-MAIN-LABEL: 'maskedload_fpextends' -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 4, <8 x i1> undef, <8 x float> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr align 2 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 undef, <8 x i1> undef, <8 x float> undef) ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 20 for: 
%v1 = fpext <2 x half> %loadv2f16 to <2 x float> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 40 for: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 80 for: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> @@ -1730,13 +1730,13 @@ define i32 @maskedload_fpextends() { ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8M-BASE-LABEL: 'maskedload_fpextends' -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 4, <8 x i1> undef, <8 x float> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr align 2 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 undef, <8 x i1> undef, <8 x float> undef) ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 20 for: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 40 for: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 80 for: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> @@ -1753,13 +1753,13 @@ define i32 @maskedload_fpextends() { ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8R-LABEL: 'maskedload_fpextends' -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> 
@llvm.masked.load.v4f16.p0(ptr undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 4, <8 x i1> undef, <8 x float> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 128 for: %loadv16f16 = call <16 x half> @llvm.masked.load.v16f16.p0(ptr align 2 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: %loadv2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 undef, <8 x i1> undef, <8 x float> undef) ; CHECK-V8R-NEXT: Cost Model: Found costs of 20 for: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> ; CHECK-V8R-NEXT: Cost Model: Found costs of 40 for: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> ; CHECK-V8R-NEXT: Cost Model: Found costs of 80 for: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> @@ -1811,14 +1811,14 @@ define i32 @maskedload_fptrunc() { ; CHECK-NEON-NEXT: Cost Model: Found costs of 80 for: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %v43264 = fptrunc <4 x double> undef to <4 x float> -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-NEON-NEXT: 
Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-NEON-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-MVE-LABEL: 'maskedload_fptrunc' @@ -1830,14 +1830,14 @@ define i32 @maskedload_fptrunc() { ; CHECK-MVE-NEXT: Cost Model: Found costs of 80 for: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-MVE-NEXT: Cost Model: Found costs of 20 for: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-MVE-NEXT: Cost Model: Found costs of 40 for: %v43264 = fptrunc <4 x double> undef to <4 x float> -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr align 2 undef, <2 x i1> undef) +; 
CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-MVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; CHECK-V8M-MAIN-LABEL: 'maskedload_fptrunc' @@ -1849,14 +1849,14 @@ define i32 @maskedload_fptrunc() { ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 80 for: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 20 for: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 40 for: %v43264 = fptrunc <4 x double> undef to <4 x float> -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void 
@llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-V8M-MAIN-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; CHECK-V8M-BASE-LABEL: 'maskedload_fptrunc' @@ -1868,14 +1868,14 @@ define i32 @maskedload_fptrunc() { ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 80 for: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 20 for: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 40 for: %v43264 = fptrunc <4 x double> undef to <4 x float> -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-V8M-BASE-NEXT: Cost Model: Found costs of 1 for: ret i32 undef ; ; 
CHECK-V8R-LABEL: 'maskedload_fptrunc' @@ -1887,14 +1887,14 @@ define i32 @maskedload_fptrunc() { ; CHECK-V8R-NEXT: Cost Model: Found costs of 80 for: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %v43264 = fptrunc <4 x double> undef to <4 x float> -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr undef, i32 2, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr undef, i32 2, <4 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr undef, i32 2, <8 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr undef, i32 4, <2 x i1> undef) -; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr undef, i32 4, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21632, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f16.p0(<2 x half> %v21664, ptr align 2 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41632, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f16.p0(<4 x half> %v41664, ptr align 2 undef, <4 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81632, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.store.v8f16.p0(<8 x half> %v81664, ptr align 2 undef, <8 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %v23264, ptr align 4 undef, <2 x i1> undef) +; CHECK-V8R-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.store.v4f32.p0(<4 x float> %v43264, ptr align 4 undef, <4 x i1> undef) ; CHECK-V8R-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %v21632 = fptrunc <2 x float> undef to <2 x half> diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll index 7a40252..6377437 100644 --- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll @@ -256,19 +256,19 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { ; THRU-LABEL: 'maskedgather' -; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 
x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedgather' -; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'maskedgather' -; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'maskedgather' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) @@ -277,19 +277,19 @@ define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) { ; THRU-LABEL: 'maskedscatter' -; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedscatter' -; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'maskedscatter' -; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'maskedscatter' -; 
SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 176 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) diff --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll index 5a23ebf..5281b5d 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll @@ -5,32 +5,32 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" define i32 @masked_gather() { ; CHECK-LABEL: 'masked_gather' -; CHECK-NEXT: Cost Model: Found costs of 16 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of 6 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found costs of 96 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 32 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 6 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 64 for: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 24 for: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 12 for: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 6 for: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 72 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of 34 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found costs of 192 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of 80 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i32> undef) -; 
CHECK-NEXT: Cost Model: Found costs of 18 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of 160 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 72 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 36 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 320 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 144 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 72 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 36 for: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 12 for: %V4I32p = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x ptr> undef) +; CHECK-NEXT: Cost Model: Found costs of 16 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found costs of 96 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 undef, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of 32 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 undef, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of 64 for: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> align 2 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 24 for: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 12 for: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 72 for: %V4I64 = 
call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of 34 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found costs of 192 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 undef, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 80 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 undef, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 18 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 160 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 2 undef, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 72 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 36 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 18 for: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 320 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 144 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 72 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 36 for: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 18 for: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 12 for: %V4I32p = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x ptr> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef) @@ -72,31 +72,31 @@ define i32 @masked_gather() { define i32 @masked_scatter() { ; CHECK-LABEL: 'masked_scatter' -; CHECK-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 4, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 96 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 4, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 4, <8 x i1> undef) -; CHECK-NEXT: Cost Model: 
Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 24 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 12 for: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 34 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 4, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 192 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 4, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 4, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 4, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 160 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 36 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 320 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 144 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 36 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 4 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 4 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 96 for: call 
void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 4 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 4 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 4 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 4 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> align 2 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 24 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 12 for: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 4 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 34 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 4 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 192 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 4 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 80 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 4 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 4 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 4 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 160 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 2 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 36 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 320 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 144 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 72 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 36 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found costs of 18 for: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> align 1 undef, <2 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 
Lat:1 SizeLat:1 for: ret i32 0 ; call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) @@ -137,27 +137,27 @@ define i32 @masked_scatter() { define void @gep_v4i32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4i32' ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %gep1, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res1, <4 x ptr> align 4 %gep1, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep2 = getelementptr i32, ptr %base, <4 x i32> %indzext -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %gep2, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res2, <4 x ptr> align 4 %gep2, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep3 = getelementptr i32, ptr %base, <4 x i32> %indsext -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %gep3, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res3, <4 x ptr> align 4 %gep3, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepu = getelementptr i32, ptr %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of 56 for: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> 
%mask) +; CHECK-NEXT: Cost Model: Found costs of 56 for: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 %gepu, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resu, <4 x ptr> align 1 %gepu, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %geposb, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resos, <4 x ptr> align 4 %geposb, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %gepbsb, <4 x i1> %mask, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resbs, <4 x ptr> align 4 %gepbsb, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32 @@ -196,27 +196,27 @@ define void @gep_v4i32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 define void @gep_v4f32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4f32' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep1, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res1, <4 x ptr> align 4 %gep1, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 
CodeSize:1 Lat:2 SizeLat:2 for: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep2 = getelementptr float, ptr %base, <4 x i32> %indzext -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep2, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res2, <4 x ptr> align 4 %gep2, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep3 = getelementptr float, ptr %base, <4 x i32> %indsext -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep3, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res3, <4 x ptr> align 4 %gep3, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gepu = getelementptr float, ptr %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of 32 for: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 32 for: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 %gepu, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of 32 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resu, <4 x ptr> align 1 %gepu, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %geposb, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: 
call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resos, <4 x ptr> align 4 %geposb, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gepbsb, <4 x i1> %mask, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resbs, <4 x ptr> align 4 %gepbsb, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32 @@ -255,25 +255,25 @@ define void @gep_v4f32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 define void @gep_v4i16(ptr %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4i16' ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of 56 for: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res1, <4 x ptr> %gep1, i32 2, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 56 for: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 %gep1, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res1, <4 x ptr> align 2 %gep1, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indzext = zext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep2 = getelementptr i16, ptr %base, <4 x i32> %indzext -; CHECK-NEXT: Cost Model: Found costs of 56 for: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res2, <4 x ptr> %gep2, i32 2, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 56 for: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 %gep2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res2, <4 x ptr> align 2 %gep2, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %indsext = sext <4 x i16> %ind16 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep3 = getelementptr i16, ptr %base, <4 x i32> %indsext -; CHECK-NEXT: Cost Model: Found costs of 56 for: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res3, <4 x ptr> %gep3, i32 2, 
<4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 56 for: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 %gep3, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 56 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res3, <4 x ptr> align 2 %gep3, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep5 = getelementptr i16, ptr %base, <4 x i16> %ind16 -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 %gep5, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res5zext = zext <4 x i16> %res5 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res5trunc, <4 x ptr> align 4 %gep5, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 %gep5, <4 x i1> %mask, <4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res6sext = sext <4 x i16> %res6 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res6trunc, <4 x ptr> align 4 %gep5, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32 @@ -309,14 +309,14 @@ define void @gep_v4i16(ptr %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> % define void @gep_v4i8(ptr %base, <4 x i8> %ind8, <4 x i1> %mask) { ; CHECK-LABEL: 'gep_v4i8' ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep5 = getelementptr i8, ptr %base, <4 x i8> %ind8 -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 2 %gep5, <4 x i1> %mask, <4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %res5zext = zext <4 x i8> %res5 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void 
@llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask) -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res5trunc, <4 x ptr> align 4 %gep5, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 2 %gep5, <4 x i1> %mask, <4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %res6sext = sext <4 x i8> %res6 to <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res6trunc, <4 x ptr> align 4 %gep5, <4 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; result zext @@ -338,34 +338,34 @@ define void @gep_v4i8(ptr %base, <4 x i8> %ind8, <4 x i1> %mask) { define void @gep_v8i16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask) { ; CHECK-LABEL: 'gep_v8i16' ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep1 = getelementptr i16, ptr %base, <8 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of 112 for: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 112 for: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 %gep1, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res1, <8 x ptr> align 2 %gep1, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep2 = getelementptr i16, ptr %base, <8 x i32> %indzext -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 %gep2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res2, <8 x ptr> align 2 %gep2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indsext = sext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep3 = getelementptr i16, ptr %base, <8 x i32> 
%indsext -; CHECK-NEXT: Cost Model: Found costs of 112 for: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask) -; CHECK-NEXT: Cost Model: Found costs of 112 for: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 112 for: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 %gep3, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res3, <8 x ptr> align 2 %gep3, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 112 for: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 %gep2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resu, <8 x ptr> align 1 %gep2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr> -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 %geposb, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resos, <8 x ptr> align 2 %geposb, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr> -; CHECK-NEXT: Cost Model: Found costs of 112 for: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 112 for: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 %gepbsb, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 112 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resbs, <8 x ptr> align 2 %gepbsb, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indzext4 = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep4 = getelementptr i16, ptr %base, <8 x i32> %indzext4 ; CHECK-NEXT: Cost Model: Found costs of 16 for: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16> -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> 
%indtrunc, <8 x ptr> %gep4, i32 2, <8 x i1> %mask) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %indtrunc, <8 x ptr> align 2 %gep4, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 %gep4, <8 x i1> %mask, <8 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %ressext = sext <8 x i16> %res to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 for: %restrunc = trunc <8 x i32> %ressext to <8 x i16> -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %restrunc, <8 x ptr> %gep4, i32 4, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %restrunc, <8 x ptr> align 4 %gep4, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; no offset ext @@ -419,26 +419,26 @@ define void @gep_v8i16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 define void @gep_v8f16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask) { ; CHECK-LABEL: 'gep_v8f16' ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep1 = getelementptr half, ptr %base, <8 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of 64 for: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 64 for: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 %gep1, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res1, <8 x ptr> align 2 %gep1, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep2 = getelementptr half, ptr %base, <8 x i32> %indzext -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 %gep2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res2, <8 x ptr> align 2 %gep2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %indsext = sext <8 x i16> %ind16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep3 = getelementptr half, ptr %base, <8 x i32> %indsext -; 
CHECK-NEXT: Cost Model: Found costs of 64 for: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask) -; CHECK-NEXT: Cost Model: Found costs of 64 for: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 64 for: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 %gep3, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res3, <8 x ptr> align 2 %gep3, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 64 for: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 1 %gep2, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resu, <8 x ptr> align 1 %gep2, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr> -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 %geposb, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resos, <8 x ptr> align 2 %geposb, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr> -; CHECK-NEXT: Cost Model: Found costs of 64 for: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 64 for: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 %gepbsb, <8 x i1> %mask, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found costs of 64 for: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resbs, <8 x ptr> align 2 %gepbsb, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; no offset ext @@ -481,14 +481,14 @@ define void @gep_v8i8(ptr %base, <8 x i8> %ind8, <8 x i1> %mask) { ; CHECK-LABEL: 'gep_v8i8' ; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <8 x i8> %ind8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep5 = getelementptr i8, ptr %base, <8 x i32> %indzext -; CHECK-NEXT: Cost 
Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 2 %gep5, <8 x i1> %mask, <8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res5zext = zext <8 x i8> %res5 to <8 x i16> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8> -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res5trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res5trunc, <8 x ptr> align 4 %gep5, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 2 %gep5, <8 x i1> %mask, <8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %res6sext = sext <8 x i8> %res6 to <8 x i16> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8> -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res6trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res6trunc, <8 x ptr> align 4 %gep5, <8 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; result zext @@ -511,24 +511,24 @@ define void @gep_v8i8(ptr %base, <8 x i8> %ind8, <8 x i1> %mask) { define void @gep_v16i8(ptr %base, ptr %base16, <16 x i8> %ind8, <16 x i32> %ind32, <16 x i1> %mask) { ; CHECK-LABEL: 'gep_v16i8' ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep1 = getelementptr i8, ptr %base, <16 x i32> %ind32 -; CHECK-NEXT: Cost Model: Found costs of 224 for: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res1, <16 x ptr> %gep1, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 224 for: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 %gep1, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res1, <16 x ptr> align 2 %gep1, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %indzext = zext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep2 = getelementptr i8, ptr %base, <16 x i32> %indzext -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: call void 
@llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res2, <16 x ptr> %gep2, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 2 %gep2, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res2, <16 x ptr> align 2 %gep2, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %indsext = sext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gep3 = getelementptr i8, ptr %base, <16 x i32> %indsext -; CHECK-NEXT: Cost Model: Found costs of 224 for: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res3, <16 x ptr> %gep3, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 224 for: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 2 %gep3, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res3, <16 x ptr> align 2 %gep3, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbs = getelementptr i16, ptr %base16, <16 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbsb = bitcast <16 x ptr> %gepbs to <16 x ptr> -; CHECK-NEXT: Cost Model: Found costs of 224 for: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbsb, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 224 for: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 2 %gepbsb, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> align 2 %gepbsb, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:74 CodeSize:1 Lat:1 SizeLat:1 for: %indzext4 = zext <16 x i8> %ind8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep4 = getelementptr i8, ptr %base, <16 x i32> %indzext ; CHECK-NEXT: Cost Model: Found costs of 32 for: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8> -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %indtrunc, <16 x ptr> %gep4, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %indtrunc, <16 x ptr> align 2 %gep4, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; no offset ext @@ -566,8 +566,8 @@ define void @gep_v16i8(ptr %base, ptr %base16, <16 x i8> %ind8, <16 x i32> %ind3 define void @gep_v16i8p(<16 x ptr> %base, i32 %off, <16 x i1> %mask) { ; CHECK-LABEL: 'gep_v16i8p' ; CHECK-NEXT: Cost Model: Found costs of 0 for: %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off -; CHECK-NEXT: Cost Model: Found costs of 224 for: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 224 for: 
call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbs, i32 2, <16 x i1> %mask) +; CHECK-NEXT: Cost Model: Found costs of 224 for: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 2 %gepbs, <16 x i1> %mask, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 224 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> align 2 %gepbs, <16 x i1> %mask) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll index 6eec7ed..0f4b1d1 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll @@ -6,85 +6,85 @@ define i32 @masked_gather() { ; CHECK-LABEL: 'masked_gather' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 8, <8 x i1> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 8, <4 x i1> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 8, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 8, <1 x i1> undef, <1 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32BF16 = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16BF16 = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8BF16 = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4BF16 = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = call <2 x bfloat> 
@llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.masked.gather.v1f16.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 8, <8 x i1> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 8, <1 x i1> undef, <1 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = call <1 x i16> @llvm.masked.gather.v1i16.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> undef, <64 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64.u = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32BF16.u = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16BF16.u = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8BF16.u = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4BF16.u = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2BF16.u = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16.u = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16.u = call <1 x half> @llvm.masked.gather.v1f16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64.u = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32.u = call <8 x i32> 
@llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32.u = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32.u = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I16.u = call <1 x i16> @llvm.masked.gather.v1i16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 undef, <8 x i1> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 undef, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 undef, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 8 undef, <1 x i1> undef, <1 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 undef, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 4 undef, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> align 4 undef, <1 x i1> undef, <1 x float> undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %V32BF16 = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> align 2 undef, <32 x i1> undef, <32 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16BF16 = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> align 2 undef, <16 x i1> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8BF16 = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4BF16 = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> align 2 undef, <1 x i1> undef, <1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> align 2 undef, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> align 2 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F16 = call <1 x half> @llvm.masked.gather.v1f16.v1p0(<1 x ptr> align 2 undef, <1 x i1> undef, <1 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 undef, <8 x i1> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 undef, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 undef, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 8 undef, <1 x i1> undef, <1 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 undef, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 undef, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> 
align 4 undef, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I32 = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> align 4 undef, <1 x i1> undef, <1 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 2 undef, <32 x i1> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 2 undef, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I16 = call <1 x i16> @llvm.masked.gather.v1i16.v1p0(<1 x ptr> align 2 undef, <1 x i1> undef, <1 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> undef, <64 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> undef, <32 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1I8 = call <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr> align 1 undef, <1 x i1> undef, <1 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64.u = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64.u = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F64.u = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F64.u = call <1 x double> 
@llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 2 undef, <1 x i1> undef, <1 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32.u = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 2 undef, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32.u = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 2 undef, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 2 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 2 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> align 2 undef, <1 x i1> undef, <1 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32BF16.u = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> align 1 undef, <32 x i1> undef, <32 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16BF16.u = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> align 1 undef, <16 x i1> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8BF16.u = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> align 1 undef, <8 x i1> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4BF16.u = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> align 1 undef, <4 x i1> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2BF16.u = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> align 1 undef, <2 x i1> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16.u = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> align 1 undef, <1 x i1> undef, <1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> align 1 undef, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> align 1 undef, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> align 1 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F16.u = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> align 1 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F16.u = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> align 1 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16.u = call <1 x half> @llvm.masked.gather.v1f16.v1p0(<1 x ptr> align 1 undef, <1 x i1> undef, <1 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I64.u = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x 
ptr> align 4 undef, <8 x i1> undef, <8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64.u = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 4 undef, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I64.u = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 4 undef, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I64.u = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 4 undef, <1 x i1> undef, <1 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I32.u = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I32.u = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I32.u = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32.u = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I32.u = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> align 1 undef, <1 x i1> undef, <1 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32I16.u = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> undef, <32 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16I16.u = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8I16.u = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16.u = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16.u = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> align 1 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1I16.u = call <1 x i16> @llvm.masked.gather.v1i16.v1p0(<1 x ptr> align 1 undef, <1 x i1> undef, <1 x i16> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 8, <8 x i1> undef, <8 x double> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll index 338683e..fe90a1a 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fixed-vector-scatter.ll @@ -6,85 +6,85 @@ define i32 @masked_scatter() { ; CHECK-LABEL: 'masked_scatter' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 8, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void 
@llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 8, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 8, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 8, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 4, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 4, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 4, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 2, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 2, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f16.v1p0(<1 x half> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 
x i64> undef, <8 x ptr> undef, i32 8, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 8, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 8, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 8, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 4, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 4, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 4, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 4, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> undef, <1 x ptr> undef, i32 4, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 2, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i16.v1p0(<1 x i16> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i8.v1p0(<1 x i8> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 2, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 2, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 2, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 2, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f16.v1p0(<1 x half> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 1, <2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i16.v1p0(<1 x i16> undef, <1 x ptr> undef, i32 1, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 8 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 8 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 8 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, 
<1 x ptr> align 8 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 4 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 4 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 4 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 4 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> align 4 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> align 2 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> align 2 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> align 2 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> align 2 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> align 2 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f16.v1p0(<1 x half> undef, <1 x ptr> align 2 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 8 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 8 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 8 undef, 
<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 8 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 4 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 4 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 4 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 4 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> undef, <1 x ptr> align 4 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 2 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 2 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i16.v1p0(<1 x i16> undef, <1 x ptr> align 2 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> align 1 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1i8.v1p0(<1 x i8> undef, <1 x ptr> align 1 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 2 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 2 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 2 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 2 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 2 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> align 2 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> align 1 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> align 1 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> align 1 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> align 1 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> align 1 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> align 1 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> align 1 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f16.v1p0(<1 x half> undef, <1 x ptr> align 1 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 
for instruction: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> undef, <1 x ptr> align 1 undef, <1 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> align 1 undef, <2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1i16.v1p0(<1 x i16> undef, <1 x ptr> align 1 undef, <1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0 ; call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 8, <8 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/gep.ll b/llvm/test/Analysis/CostModel/RISCV/gep.ll index f8c3700..12dffb4 100644 --- a/llvm/test/Analysis/CostModel/RISCV/gep.ll +++ b/llvm/test/Analysis/CostModel/RISCV/gep.ll @@ -264,9 +264,9 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x1 = load volatile <2 x i8>, ptr %1, align 2 ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = getelementptr i8, ptr %base, i32 42 -; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x2 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr %2, i32 1, <2 x i1> undef, <2 
x i8> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x2 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 %2, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43> -; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 %3, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = getelementptr i8, ptr %base, i32 42 @@ -276,9 +276,9 @@ define void @non_foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store volatile <2 x i8> undef, ptr %7, align 2 ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = getelementptr i8, ptr %base, i32 42 -; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i8.p0(<2 x i8> undef, ptr %8, i32 1, <2 x i1> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i8.p0(<2 x i8> undef, ptr align 1 %8, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> <i32 42, i32 43> -; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> align 1 %9, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = getelementptr i8, ptr %base, i32 42 ; RVI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = getelementptr i8, ptr %base, i32 42 @@ -334,9 +334,9 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %1 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x1 = load volatile <2 x i8>, ptr %1, align 2 ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %2 = getelementptr i8, ptr %base, i32 0 -; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x2 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr %2, i32 1, <2 x i1> undef, <2 x i8> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x2 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 %2, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %3 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer -; RVI-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> %3, i32 1, <2 x i1> undef, <2 x i8> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x3 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 %3, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %4 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %x4 = call <2 x i8> @llvm.masked.expandload.v2i8(ptr %4, <2 x i1> undef, <2 x i8> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %5 = getelementptr i8, ptr %base, i32 0 @@ -346,9 +346,9 @@ define void @foldable_vector_uses(ptr %base, <2 x ptr> %base.vec) { ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %7 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store volatile <2 x i8> undef, ptr %7, align 2 ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %8 = getelementptr i8, ptr %base, i32 0 -; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i8.p0(<2 x i8> undef, ptr %8, i32 1, <2 x i1> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.store.v2i8.p0(<2 x i8> undef, ptr align 1 %8, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %9 = getelementptr i8, <2 x ptr> %base.vec, <2 x i32> zeroinitializer -; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> %9, i32 1, <2 x i1> undef) +; RVI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> align 1 %9, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %10 = getelementptr i8, ptr %base, i32 0 ; RVI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.compressstore.v2i8(<2 x i8> undef, ptr %10, <2 x i1> undef) ; RVI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %11 = getelementptr i8, ptr %base, i32 0 diff --git a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll index 892277a..e572be7 100644 --- a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll +++ b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll @@ -3,24 +3,24 @@ define void @fixed() { ; CHECK-LABEL: 'fixed' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 8 undef, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 8 undef, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 8 undef, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 8 undef, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr align 8 undef, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 8 undef, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 8 undef, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> 
@llvm.masked.load.v2i32.p0(ptr align 8 undef, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 8 undef, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 8 undef, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr align 8 undef, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr align 8 undef, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr align 8 undef, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 8 undef, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 8 undef, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 8 undef, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 undef, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr align 8 undef, <32 x i1> undef, <32 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: @@ -53,25 +53,25 @@ entry: define void @scalable() { ; CHECK-LABEL: 'scalable' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 8, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = 
call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr undef, i32 8, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 8 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr align 8 undef, <vscale x 4 x i1> 
undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv2f16 = call <vscale x 2 x half> @llvm.masked.load.nxv2f16.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv4f16 = call <vscale x 4 x half> @llvm.masked.load.nxv4f16.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv8f16 = call <vscale x 8 x half> @llvm.masked.load.nxv8f16.p0(ptr align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32 = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64 = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = call <vscale x 4 x i64> @llvm.masked.load.nxv4i64.p0(ptr align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nxv32f16 = call <vscale x 32 x half> @llvm.masked.load.nxv32f16.p0(ptr align 8 undef, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: diff --git a/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll b/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll index 8504b4c..997ec12 100644 --- a/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll +++ b/llvm/test/Analysis/CostModel/RISCV/scalable-gather.ll @@ -45,79 +45,79 @@ define void @masked_gather_aligned() { ; GENERIC-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; MAX256-LABEL: 'masked_gather_aligned' -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call 
<vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x float> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0(<vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x float> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> 
undef, <vscale x 2 x i32> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0(<vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0(<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0(<vscale x 64 x ptr> undef, i32 1, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0(<vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0(<vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8PTR = call <vscale x 8 x ptr> @llvm.masked.gather.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x ptr> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4PTR = call <vscale x 4 x ptr> 
@llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x ptr> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2PTR = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x ptr> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1PTR = call <vscale x 1 x ptr> @llvm.masked.gather.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x ptr> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x double> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef, <vscale x 16 x float> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0(<vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef, <vscale x 1 x float> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef, 
<vscale x 1 x i64> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0(<vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0(<vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0(<vscale x 64 x ptr> align 1 undef, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0(<vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call 
<vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0(<vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8PTR = call <vscale x 8 x ptr> @llvm.masked.gather.nxv8p0.nxv8p0(<vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x ptr> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4PTR = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x ptr> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2PTR = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x ptr> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1PTR = call <vscale x 1 x ptr> @llvm.masked.gather.nxv1p0.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x ptr> undef) ; MAX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; UNSUPPORTED-LABEL: 'masked_gather_aligned' -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x double> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x float> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4F32 = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0(<vscale x 1 x ptr> undef, i32 4, 
<vscale x 1 x i1> undef, <vscale x 1 x float> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0(<vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0(<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0(<vscale x 64 x ptr> undef, i32 1, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: 
Invalid cost for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0(<vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0(<vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8PTR = call <vscale x 8 x ptr> @llvm.masked.gather.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x ptr> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4PTR = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef, <vscale x 4 x ptr> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2PTR = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef, <vscale x 2 x ptr> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1PTR = call <vscale x 1 x ptr> @llvm.masked.gather.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef, <vscale x 1 x ptr> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4F64 = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2F64 = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1F64 = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x double> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16F32 = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef, <vscale x 16 x float> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8F32 = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4F32 = call <vscale x 4 x float> 
@llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2F32 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0(<vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef, <vscale x 1 x float> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I64 = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I64 = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I64 = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I64 = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16I32 = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I32 = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I32 = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I32 = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I32 = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V32I16 = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0(<vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16I16 = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I16 = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I16 = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I16 = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> align 2 
undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I16 = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0(<vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V64I8 = call <vscale x 64 x i8> @llvm.masked.gather.nxv64i8.nxv64p0(<vscale x 64 x ptr> align 1 undef, <vscale x 64 x i1> undef, <vscale x 64 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V32I8 = call <vscale x 32 x i8> @llvm.masked.gather.nxv32i8.nxv32p0(<vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef, <vscale x 32 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16I8 = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8I8 = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4I8 = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2I8 = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1I8 = call <vscale x 1 x i8> @llvm.masked.gather.nxv1i8.nxv1p0(<vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef, <vscale x 1 x i8> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8PTR = call <vscale x 8 x ptr> @llvm.masked.gather.nxv8p0.nxv8p0(<vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef, <vscale x 8 x ptr> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4PTR = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef, <vscale x 4 x ptr> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2PTR = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef, <vscale x 2 x ptr> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1PTR = call <vscale x 1 x ptr> @llvm.masked.gather.nxv1p0.nxv1p0(<vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef, <vscale x 1 x ptr> undef) ; UNSUPPORTED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V8F64 = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) @@ -176,21 +176,21 @@ define void @masked_gather_aligned_f16() { ; GENERIC-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; MAX256-LABEL: 'masked_gather_aligned_f16' -; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0(<vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x half> undef) -; 
MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0(<vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0(<vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0(<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x half> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0(<vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef, <vscale x 16 x half> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0(<vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0(<vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0(<vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0(<vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef, <vscale x 1 x half> undef) ; MAX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; UNSUPPORTED-LABEL: 'masked_gather_aligned_f16' -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0(<vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x half> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0(<vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0(<vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) -; UNSUPPORTED-NEXT: 
Cost Model: Invalid cost for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0(<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x half> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V16F16 = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0(<vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef, <vscale x 16 x half> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V8F16 = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0(<vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V4F16 = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0(<vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V2F16 = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0(<vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: %V1F16 = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0(<vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef, <vscale x 1 x half> undef) ; UNSUPPORTED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V32F16 = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) @@ -205,34 +205,34 @@ define void @masked_gather_aligned_f16() { define void @masked_gather_unaligned() { ; CHECK-LABEL: 'masked_gather_unaligned' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F64.u = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F64.u = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F64.u = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F64.u = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16F32.u = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef, <vscale x 16 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F32.u = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F32.u = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F32.u = call <vscale x 2 x float> 
@llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32.u = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0(<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef, <vscale x 1 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8I64.u = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4I64.u = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2I64.u = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1I64.u = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16I32.u = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8I32.u = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4I32.u = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2I32.u = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1I32.u = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V32I16.u = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0(<vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16I16.u = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8I16.u = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4I16.u = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2I16.u = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1I16.u = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0(<vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef) -; CHECK-NEXT: Cost Model: 
Invalid cost for instruction: %V8PTR = call <vscale x 8 x ptr> @llvm.masked.gather.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x ptr> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4PTR = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x ptr> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2PTR = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x ptr> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1PTR = call <vscale x 1 x ptr> @llvm.masked.gather.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x ptr> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F64.u = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F64.u = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef, <vscale x 4 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F64.u = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef, <vscale x 2 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F64.u = call <vscale x 1 x double> @llvm.masked.gather.nxv1f64.nxv1p0(<vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef, <vscale x 1 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16F32.u = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef, <vscale x 16 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F32.u = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32.nxv8p0(<vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef, <vscale x 8 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F32.u = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef, <vscale x 4 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F32.u = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef, <vscale x 2 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32.u = call <vscale x 1 x float> @llvm.masked.gather.nxv1f32.nxv1p0(<vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef, <vscale x 1 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8I64.u = call <vscale x 8 x i64> @llvm.masked.gather.nxv8i64.nxv8p0(<vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef, <vscale x 8 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4I64.u = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef, <vscale x 4 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2I64.u = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef, <vscale x 2 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1I64.u = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 4 
undef, <vscale x 1 x i1> undef, <vscale x 1 x i64> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16I32.u = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i32> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8I32.u = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x i32> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4I32.u = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2I32.u = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x i32> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1I32.u = call <vscale x 1 x i32> @llvm.masked.gather.nxv1i32.nxv1p0(<vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef, <vscale x 1 x i32> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V32I16.u = call <vscale x 32 x i16> @llvm.masked.gather.nxv32i16.nxv32p0(<vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef, <vscale x 32 x i16> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16I16.u = call <vscale x 16 x i16> @llvm.masked.gather.nxv16i16.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i16> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8I16.u = call <vscale x 8 x i16> @llvm.masked.gather.nxv8i16.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x i16> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4I16.u = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x i16> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2I16.u = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x i16> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1I16.u = call <vscale x 1 x i16> @llvm.masked.gather.nxv1i16.nxv1p0(<vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef, <vscale x 1 x i16> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8PTR = call <vscale x 8 x ptr> @llvm.masked.gather.nxv8p0.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x ptr> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4PTR = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x ptr> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2PTR = call <vscale x 2 x ptr> @llvm.masked.gather.nxv2p0.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x ptr> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1PTR = call <vscale x 1 x ptr> @llvm.masked.gather.nxv1p0.nxv1p0(<vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef, <vscale x 1 x ptr> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V8F64.u = call <vscale x 8 x double> @llvm.masked.gather.nxv8f64.nxv8p0(<vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef, <vscale x 8 x double> undef) @@ -274,12 +274,12 @@ define void @masked_gather_unaligned() { 
define void @masked_gather_unaligned_f16() { ; CHECK-LABEL: 'masked_gather_unaligned_f16' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V32F16.u = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16F16.u = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0(<vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F16.u = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0(<vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F16.u = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0(<vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F16.u = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0(<vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F16.u = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0(<vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef, <vscale x 1 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V32F16.u = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16F16.u = call <vscale x 16 x half> @llvm.masked.gather.nxv16f16.nxv16p0(<vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F16.u = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16.nxv8p0(<vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef, <vscale x 8 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F16.u = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0(<vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef, <vscale x 4 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F16.u = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16.nxv2p0(<vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef, <vscale x 2 x half> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F16.u = call <vscale x 1 x half> @llvm.masked.gather.nxv1f16.nxv1p0(<vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef, <vscale x 1 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %V32F16.u = call <vscale x 32 x half> @llvm.masked.gather.nxv32f16.nxv32p0(<vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef, <vscale x 32 x half> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll b/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll index 0e3c43a..69abcde 100644 --- a/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll +++ b/llvm/test/Analysis/CostModel/RISCV/scalable-scatter.ll @@ -45,79 +45,79 @@ define void @masked_scatter_aligned() { ; GENERIC-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; MAX256-LABEL: 'masked_scatter_aligned' -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x 
ptr> undef, i32 8, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> undef, <vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0(<vscale x 16 x float> undef, <vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0(<vscale x 1 x float> undef, <vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> undef, <vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> undef, <vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> undef, <vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0(<vscale x 16 x i32> undef, <vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> undef, <vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> undef, <vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef) -; 
MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0(<vscale x 32 x i16> undef, <vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0(<vscale x 2 x i16> undef, <vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0(<vscale x 1 x i16> undef, <vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0(<vscale x 64 x i8> undef, <vscale x 64 x ptr> undef, i32 1, <vscale x 64 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0(<vscale x 32 x i8> undef, <vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0(<vscale x 1 x i8> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, <vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, <vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, <vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, <vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0(<vscale x 16 x float> undef, <vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0(<vscale x 1 x float> undef, <vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> undef, <vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> undef, <vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> undef, <vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0(<vscale x 16 x i32> undef, <vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> undef, <vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void 
@llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> undef, <vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0(<vscale x 32 x i16> undef, <vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0(<vscale x 2 x i16> undef, <vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0(<vscale x 1 x i16> undef, <vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0(<vscale x 64 x i8> undef, <vscale x 64 x ptr> align 1 undef, <vscale x 64 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0(<vscale x 32 x i8> undef, <vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0(<vscale x 1 x i8> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, <vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, <vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, <vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1p0.nxv1p0(<vscale x 1 x 
ptr> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef) ; MAX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; UNSUPPORTED-LABEL: 'masked_scatter_aligned' -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> undef, <vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0(<vscale x 16 x float> undef, <vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0(<vscale x 1 x float> undef, <vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> undef, <vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> undef, <vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> undef, <vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0(<vscale x 16 x i32> undef, <vscale x 16 x ptr> undef, i32 4, <vscale x 16 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 4, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 4, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> undef, <vscale x 2 x ptr> undef, i32 4, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid 
cost for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> undef, <vscale x 1 x ptr> undef, i32 4, <vscale x 1 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0(<vscale x 32 x i16> undef, <vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0(<vscale x 2 x i16> undef, <vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0(<vscale x 1 x i16> undef, <vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0(<vscale x 64 x i8> undef, <vscale x 64 x ptr> undef, i32 1, <vscale x 64 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0(<vscale x 32 x i8> undef, <vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0(<vscale x 1 x i8> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, <vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, <vscale x 4 x ptr> undef, i32 8, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, <vscale x 2 x ptr> undef, i32 8, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, <vscale x 1 x ptr> undef, i32 8, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void 
@llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0(<vscale x 16 x float> undef, <vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0(<vscale x 1 x float> undef, <vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> undef, <vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> undef, <vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> undef, <vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0(<vscale x 16 x i32> undef, <vscale x 16 x ptr> align 4 undef, <vscale x 16 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 4 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 4 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> undef, <vscale x 2 x ptr> align 4 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> undef, <vscale x 1 x ptr> align 4 undef, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void 
@llvm.masked.scatter.nxv32i16.nxv32p0(<vscale x 32 x i16> undef, <vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0(<vscale x 2 x i16> undef, <vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0(<vscale x 1 x i16> undef, <vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv64i8.nxv64p0(<vscale x 64 x i8> undef, <vscale x 64 x ptr> align 1 undef, <vscale x 64 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32i8.nxv32p0(<vscale x 32 x i8> undef, <vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i8.nxv2p0(<vscale x 2 x i8> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i8.nxv1p0(<vscale x 1 x i8> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, <vscale x 8 x ptr> align 8 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, <vscale x 4 x ptr> align 8 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, <vscale x 2 x ptr> align 8 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, <vscale x 1 x ptr> align 8 undef, <vscale x 1 x i1> undef) ; UNSUPPORTED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> undef, i32 8, <vscale x 8 x i1> undef) @@ -176,21 +176,21 @@ define void @masked_scatter_aligned_f16() { ; GENERIC-NEXT: Cost Model: Found 
an estimated cost of 0 for instruction: ret void ; ; MAX256-LABEL: 'masked_scatter_aligned_f16' -; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0(<vscale x 16 x half> undef, <vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0(<vscale x 8 x half> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0(<vscale x 4 x half> undef, <vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0(<vscale x 2 x half> undef, <vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef) -; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0(<vscale x 1 x half> undef, <vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0(<vscale x 16 x half> undef, <vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0(<vscale x 8 x half> undef, <vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0(<vscale x 4 x half> undef, <vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0(<vscale x 2 x half> undef, <vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef) +; MAX256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0(<vscale x 1 x half> undef, <vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef) ; MAX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; UNSUPPORTED-LABEL: 'masked_scatter_aligned_f16' -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0(<vscale x 16 x half> undef, <vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0(<vscale x 8 x half> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0(<vscale x 4 x half> undef, <vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost 
for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0(<vscale x 2 x half> undef, <vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef) -; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0(<vscale x 1 x half> undef, <vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> align 2 undef, <vscale x 32 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0(<vscale x 16 x half> undef, <vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0(<vscale x 8 x half> undef, <vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0(<vscale x 4 x half> undef, <vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0(<vscale x 2 x half> undef, <vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef) +; UNSUPPORTED-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0(<vscale x 1 x half> undef, <vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef) ; UNSUPPORTED-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> undef, i32 2, <vscale x 32 x i1> undef) @@ -205,34 +205,34 @@ define void @masked_scatter_aligned_f16() { define void @masked_scatter_unaligned() { ; CHECK-LABEL: 'masked_scatter_unaligned' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> undef, <vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0(<vscale x 16 x float> undef, <vscale x 16 x ptr> undef, i32 2, <vscale x 16 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> undef, i32 2, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> undef, i32 2, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0(<vscale x 1 x float> undef, 
<vscale x 1 x ptr> undef, i32 2, <vscale x 1 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0(<vscale x 16 x i32> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32i16.nxv32p0(<vscale x 32 x i16> undef, <vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0(<vscale x 2 x i16> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0(<vscale x 1 x i16> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1p0.nxv1p0(<vscale x 1 x 
ptr> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f64.nxv4p0(<vscale x 4 x double> undef, <vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> undef, <vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f64.nxv1p0(<vscale x 1 x double> undef, <vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f32.nxv16p0(<vscale x 16 x float> undef, <vscale x 16 x ptr> align 2 undef, <vscale x 16 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f32.nxv8p0(<vscale x 8 x float> undef, <vscale x 8 x ptr> align 2 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> undef, <vscale x 4 x ptr> align 2 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> undef, <vscale x 2 x ptr> align 2 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f32.nxv1p0(<vscale x 1 x float> undef, <vscale x 1 x ptr> align 2 undef, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i64.nxv8p0(<vscale x 8 x i64> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i32.nxv16p0(<vscale x 16 x i32> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0(<vscale x 8 x i32> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i32.nxv1p0(<vscale x 1 x i32> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void 
@llvm.masked.scatter.nxv32i16.nxv32p0(<vscale x 32 x i16> undef, <vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16i16.nxv16p0(<vscale x 16 x i16> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2i16.nxv2p0(<vscale x 2 x i16> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1i16.nxv1p0(<vscale x 1 x i16> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8p0.nxv8p0(<vscale x 8 x ptr> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4p0.nxv4p0(<vscale x 4 x ptr> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2p0.nxv2p0(<vscale x 2 x ptr> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1p0.nxv1p0(<vscale x 1 x ptr> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv8f64.nxv8p0(<vscale x 8 x double> undef, <vscale x 8 x ptr> undef, i32 2, <vscale x 8 x i1> undef) @@ -274,12 +274,12 @@ define void @masked_scatter_unaligned() { define void @masked_scatter_unaligned_f16() { ; CHECK-LABEL: 'masked_scatter_unaligned_f16' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0(<vscale x 16 x half> undef, <vscale x 16 x ptr> undef, i32 1, <vscale x 16 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0(<vscale x 8 x half> undef, <vscale x 8 x ptr> undef, i32 1, <vscale x 8 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0(<vscale x 4 x half> undef, <vscale x 4 x ptr> undef, i32 1, <vscale x 4 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0(<vscale x 2 x half> undef, <vscale x 2 x ptr> undef, i32 1, <vscale x 2 x i1> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0(<vscale x 1 x half> undef, <vscale x 1 x ptr> undef, i32 1, <vscale x 1 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> align 1 undef, <vscale x 32 x i1> undef) +; CHECK-NEXT: Cost Model: 
Invalid cost for instruction: call void @llvm.masked.scatter.nxv16f16.nxv16p0(<vscale x 16 x half> undef, <vscale x 16 x ptr> align 1 undef, <vscale x 16 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv8f16.nxv8p0(<vscale x 8 x half> undef, <vscale x 8 x ptr> align 1 undef, <vscale x 8 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv4f16.nxv4p0(<vscale x 4 x half> undef, <vscale x 4 x ptr> align 1 undef, <vscale x 4 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv2f16.nxv2p0(<vscale x 2 x half> undef, <vscale x 2 x ptr> align 1 undef, <vscale x 2 x i1> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.masked.scatter.nxv1f16.nxv1p0(<vscale x 1 x half> undef, <vscale x 1 x ptr> align 1 undef, <vscale x 1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call void @llvm.masked.scatter.nxv32f16.nxv32p0(<vscale x 32 x half> undef, <vscale x 32 x ptr> undef, i32 1, <vscale x 32 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll index bcef47e..8ed8b2e 100644 --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -310,19 +310,19 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { ; THRU-LABEL: 'maskedgather' -; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedgather' -; LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'maskedgather' -; SIZE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'maskedgather' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 %va, <16 x i1> %vb, <16 x float> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; 
%v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) @@ -331,19 +331,19 @@ define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) { define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) { ; THRU-LABEL: 'maskedscatter' -; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedscatter' -; LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'maskedscatter' -; SIZE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'maskedscatter' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> align 1 %vb, <16 x i1> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc) diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll index aea7cc8..0854378 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll @@ -10,298 +10,298 @@ define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_load' -; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:38 Lat:38 SizeLat:38 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:23 
CodeSize:28 Lat:28 SizeLat:28 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:86 Lat:86 SizeLat:86 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:67 CodeSize:81 Lat:81 SizeLat:81 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:63 CodeSize:76 Lat:76 SizeLat:76 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:59 CodeSize:71 Lat:71 SizeLat:71 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:66 Lat:66 SizeLat:66 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:61 Lat:61 SizeLat:61 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:47 CodeSize:56 Lat:56 SizeLat:56 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:40 Lat:40 SizeLat:40 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:35 Lat:35 SizeLat:35 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:30 Lat:30 SizeLat:30 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:17 Lat:17 SizeLat:17 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, 
<3 x i1> %m3, <3 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:45 Lat:45 SizeLat:45 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:33 Lat:33 SizeLat:33 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:86 CodeSize:101 Lat:101 SizeLat:101 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:81 CodeSize:95 Lat:95 SizeLat:95 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:76 CodeSize:89 Lat:89 SizeLat:89 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:83 Lat:83 SizeLat:83 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:66 CodeSize:77 Lat:77 SizeLat:77 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:71 Lat:71 SizeLat:71 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:65 Lat:65 SizeLat:65 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr 
undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:47 Lat:47 SizeLat:47 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:41 Lat:41 SizeLat:41 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:35 Lat:35 SizeLat:35 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:20 SizeLat:20 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:38 Lat:38 SizeLat:38 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:23 CodeSize:28 Lat:28 SizeLat:28 for: %V5F64 = call <5 x double> 
@llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:86 Lat:86 SizeLat:86 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:67 CodeSize:81 Lat:81 SizeLat:81 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:63 CodeSize:76 Lat:76 SizeLat:76 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:59 CodeSize:71 Lat:71 SizeLat:71 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:66 Lat:66 SizeLat:66 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:61 Lat:61 SizeLat:61 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:47 CodeSize:56 Lat:56 SizeLat:56 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:40 Lat:40 SizeLat:40 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:35 Lat:35 SizeLat:35 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:30 Lat:30 SizeLat:30 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:17 Lat:17 SizeLat:17 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; 
SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:45 Lat:45 SizeLat:45 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:33 Lat:33 SizeLat:33 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:86 CodeSize:101 Lat:101 SizeLat:101 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:81 CodeSize:95 Lat:95 SizeLat:95 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:76 CodeSize:89 Lat:89 SizeLat:89 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:83 Lat:83 SizeLat:83 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:66 CodeSize:77 Lat:77 SizeLat:77 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:71 Lat:71 SizeLat:71 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:65 Lat:65 SizeLat:65 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x 
i1> %m8, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:47 Lat:47 SizeLat:47 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:41 Lat:41 SizeLat:41 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:35 Lat:35 SizeLat:35 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:20 SizeLat:20 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_load' -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:32 Lat:32 SizeLat:32 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; SSE42-NEXT: 
Cost Model: Found costs of RThru:19 CodeSize:24 Lat:24 SizeLat:24 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F32 = call <3 x 
float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 
for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:32 Lat:32 SizeLat:32 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:19 
CodeSize:24 Lat:24 SizeLat:24 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F32 = call <3 x float> 
@llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 
SizeLat:40 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX-LABEL: 'masked_load' -; AVX-NEXT: Cost Model: Found costs of 4 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; AVX-NEXT: Cost Model: Found 
costs of 5 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 4 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 4 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; 
AVX-NEXT: Cost Model: Found costs of 5 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 4 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX-NEXT: Cost 
Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V10F32 
= call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V11I32 = call 
<11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_load' -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; KNL-NEXT: 
Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> 
%m8, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 
RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 
for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: 
%V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_load' -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> 
undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr 
undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 
CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; SKX-NEXT: Cost 
Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; SKX-NEXT: Cost Model: 
Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) @@ -371,298 +371,298 @@ define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_store' -; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 
for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:70 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:79 Lat:79 SizeLat:79 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE2-NEXT: Cost Model: Found costs of RThru:60 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:68 Lat:68 SizeLat:68 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:62 Lat:62 SizeLat:62 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:56 Lat:56 SizeLat:56 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE2-NEXT: Cost Model: Found costs of RThru:41 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void 
@llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:44 Lat:44 SizeLat:44 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:85 CodeSize:100 Lat:100 SizeLat:100 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE2-NEXT: Cost Model: Found costs of RThru:73 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE2-NEXT: Cost Model: Found costs of RThru:68 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE2-NEXT: Cost Model: Found costs of RThru:62 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE2-NEXT: Cost Model: Found costs of RThru:50 CodeSize:59 Lat:59 SizeLat:59 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; 
SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 
SizeLat:15 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:70 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:79 Lat:79 SizeLat:79 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE2-NEXT: Cost Model: Found costs of RThru:60 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:68 Lat:68 SizeLat:68 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:62 Lat:62 SizeLat:62 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:56 Lat:56 SizeLat:56 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE2-NEXT: Cost Model: Found costs of RThru:41 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:44 Lat:44 SizeLat:44 for: call void 
@llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:85 CodeSize:100 Lat:100 SizeLat:100 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE2-NEXT: Cost Model: Found costs of RThru:73 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE2-NEXT: Cost Model: Found costs of RThru:68 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE2-NEXT: Cost Model: Found costs of RThru:62 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE2-NEXT: Cost Model: Found costs of RThru:50 CodeSize:59 Lat:59 SizeLat:59 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, 
<3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_store' -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:17 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:13 Lat:13 SizeLat:13 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: 
call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE42-NEXT: Cost Model: Found costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> 
undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost 
Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:17 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:13 Lat:13 SizeLat:13 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE42-NEXT: Cost Model: Found 
costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: 
call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void 
@llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX-LABEL: 'masked_store' -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr 
undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found 
costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; AVX-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void 
@llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 
1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; AVX-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_store' -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 
for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, 
i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void 
@llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void 
@llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:32 
CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_store' -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void 
@llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost 
Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void 
@llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 
1 undef, <11 x i1> %m11) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) @@ -732,192 +732,192 @@ define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_gather' -; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) 
-; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 
x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 
undef, <4 x i1> %m4, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_gather' -; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x 
float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 
x i1> %m4, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I16 = call <4 x i16> 
@llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX1-LABEL: 'masked_gather' -; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 
x i1> %m16, <16 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> 
@llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; 
AVX2-LABEL: 'masked_gather' -; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 
1, <16 x i1> %m16, <16 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x 
ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKL-LABEL: 'masked_gather' -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> 
undef) -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKL-NEXT: Cost Model: 
Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:54 
CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_gather' -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> 
@llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 
x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_gather' -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKX-NEXT: Cost Model: 
Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SKX-NEXT: Cost 
Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) @@ -955,192 +955,192 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8 define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_scatter' -; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 
x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE2-NEXT: Cost 
Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 
Lat:162 SizeLat:162 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SSE2-NEXT: Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_scatter' -; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> 
undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) 
+; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX1-LABEL: 'masked_scatter' -; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call 
void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, 
i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: 
Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX2-LABEL: 'masked_scatter' -; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void 
@llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x 
ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; 
AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKL-LABEL: 'masked_scatter' -; SKL-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> 
undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of 
RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_scatter' -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 
1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 
Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void 
@llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_scatter' -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 
for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 
undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) @@ -1571,22 +1571,22 @@ define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { ; SSE2-LABEL: 'test1' ; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:4 SizeLat:5 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; SSE42-LABEL: 'test1' ; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX-LABEL: 'test1' ; AVX-NEXT: Cost 
Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX512-LABEL: 'test1' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; %mask = icmp eq <2 x i64> %trigger, zeroinitializer @@ -1597,22 +1597,22 @@ define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { ; SSE2-LABEL: 'test2' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SSE42-LABEL: 'test2' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX-LABEL: 'test2' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX512-LABEL: 'test2' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %mask = icmp eq <4 x i32> %trigger, 
zeroinitializer @@ -1623,22 +1623,22 @@ define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { ; SSE2-LABEL: 'test3' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test3' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test3' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test3' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -1649,32 +1649,32 @@ define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { ; SSE2-LABEL: 'test4' ; SSE2-NEXT: Cost Model: Found costs of 2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; SSE42-LABEL: 'test4' ; SSE42-NEXT: Cost Model: Found costs of 2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr 
align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; AVX1-LABEL: 'test4' ; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; AVX1-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; AVX2-LABEL: 'test4' ; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; AVX2-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; SKL-LABEL: 'test4' ; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SKL-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; AVX512-LABEL: 'test4' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; %mask = icmp eq <8 x i32> %trigger, zeroinitializer @@ -1685,22 +1685,22 @@ define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { ; SSE2-LABEL: 'test5' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test5' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void 
@llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test5' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test5' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1711,22 +1711,22 @@ define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { ; SSE2-LABEL: 'test6' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test6' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test6' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test6' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1737,22 +1737,22 @@ define void 
@test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { ; SSE2-LABEL: 'test7' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; ; SSE42-LABEL: 'test7' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; ; AVX-LABEL: 'test7' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; ; AVX512-LABEL: 'test7' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1763,22 +1763,22 @@ define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { ; SSE2-LABEL: 'test8' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; ; SSE42-LABEL: 'test8' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found costs of 
RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; ; AVX-LABEL: 'test8' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; ; AVX512-LABEL: 'test8' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1788,27 +1788,27 @@ define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { ; SSE2-LABEL: 'test_gather_2f64' -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; SSE42-LABEL: 'test_gather_2f64' -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX1-LABEL: 'test_gather_2f64' -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX2-LABEL: 'test_gather_2f64' -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> 
%src0) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; SKL-LABEL: 'test_gather_2f64' -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX512-LABEL: 'test_gather_2f64' -; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) @@ -1817,31 +1817,31 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { ; SSE2-LABEL: 'test_gather_4i32' -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SSE42-LABEL: 'test_gather_4i32' -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32' -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX2-LABEL: 'test_gather_4i32' -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 
SizeLat:1 for: ret <4 x i32> %res ; ; SKL-LABEL: 'test_gather_4i32' -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; KNL-LABEL: 'test_gather_4i32' -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SKX-LABEL: 'test_gather_4i32' -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) @@ -1850,31 +1850,31 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { ; SSE2-LABEL: 'test_gather_4i32_const_mask' -; SSE2-NEXT: Cost Model: Found costs of 19 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found costs of 19 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SSE42-LABEL: 'test_gather_4i32_const_mask' -; SSE42-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32_const_mask' -; AVX1-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX2-LABEL: 'test_gather_4i32_const_mask' -; AVX2-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; 
AVX2-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SKL-LABEL: 'test_gather_4i32_const_mask' -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; KNL-LABEL: 'test_gather_4i32_const_mask' -; KNL-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SKX-LABEL: 'test_gather_4i32_const_mask' -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) @@ -1885,37 +1885,37 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { ; SSE2-LABEL: 'test_gather_16f32_const_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_const_mask' ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; 
AVX1-LABEL: 'test_gather_16f32_const_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_const_mask' ; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_const_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_const_mask' ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -1929,37 +1929,37 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 ; SSE2-LABEL: 'test_gather_16f32_var_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> 
%sext_ind -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_var_mask' ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_var_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_var_mask' ; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_var_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> 
%mask, <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_var_mask' ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -1973,37 +1973,37 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX2-NEXT: Cost Model: 
Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_ra_var_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -2019,7 +2019,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; SSE2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_const_mask2' @@ -2027,7 +2027,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; 
SSE42-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_const_mask2' @@ -2035,7 +2035,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_const_mask2' @@ -2043,7 +2043,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_const_mask2' @@ -2051,7 +2051,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = 
getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_const_mask2' @@ -2059,7 +2059,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 @@ -2078,7 +2078,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE2-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test_scatter_16i32' @@ -2086,7 +2086,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE42-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE42-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found 
costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'test_scatter_16i32' @@ -2094,7 +2094,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX1-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'test_scatter_16i32' @@ -2102,7 +2102,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX2-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SKL-LABEL: 'test_scatter_16i32' @@ -2110,7 +2110,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SKL-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test_scatter_16i32' @@ -2118,7 +2118,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: 
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX512-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 @@ -2132,19 +2132,19 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_8i32' -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test_scatter_8i32' -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test_scatter_8i32' -; AVX-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test_scatter_8i32' -; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) @@ -2153,23 +2153,23 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_4i32' -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found 
costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test_scatter_4i32' -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test_scatter_4i32' -; AVX-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; KNL-LABEL: 'test_scatter_4i32' -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SKX-LABEL: 'test_scatter_4i32' -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) @@ -2180,43 +2180,43 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { ; SSE2-LABEL: 'test_gather_4f32' ; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32' ; SSE42-NEXT: Cost Model: Found costs of 2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x 
i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32' ; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32' ; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SKL-LABEL: 'test_gather_4f32' ; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; KNL-LABEL: 'test_gather_4f32' ; KNL-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; KNL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SKX-LABEL: 'test_gather_4f32' ; SKX-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKX-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr 
float, ptr %ptr, <4 x i64> %sext_ind -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; %sext_ind = sext <4 x i32> %ind to <4 x i64> @@ -2230,43 +2230,43 @@ define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) { ; SSE2-LABEL: 'test_gather_4f32_const_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of 15 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 15 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32_const_mask' ; SSE42-NEXT: Cost Model: Found costs of 2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of 11 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 11 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32_const_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32_const_mask' ; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> 
%res ; ; SKL-LABEL: 'test_gather_4f32_const_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; KNL-LABEL: 'test_gather_4f32_const_mask' ; KNL-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; KNL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; KNL-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SKX-LABEL: 'test_gather_4f32_const_mask' ; SKX-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKX-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; %sext_ind = sext <4 x i32> %ind to <4 x i64> diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll index 742b5b2..b68975f 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll @@ -10,298 +10,298 @@ define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_load' -; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:38 Lat:38 SizeLat:38 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:23 CodeSize:28 
Lat:28 SizeLat:28 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:86 Lat:86 SizeLat:86 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:67 CodeSize:81 Lat:81 SizeLat:81 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:63 CodeSize:76 Lat:76 SizeLat:76 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:59 CodeSize:71 Lat:71 SizeLat:71 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:66 Lat:66 SizeLat:66 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:61 Lat:61 SizeLat:61 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:47 CodeSize:56 Lat:56 SizeLat:56 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:40 Lat:40 SizeLat:40 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:35 Lat:35 SizeLat:35 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:30 Lat:30 SizeLat:30 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:17 Lat:17 SizeLat:17 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> 
%m3, <3 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:45 Lat:45 SizeLat:45 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:33 Lat:33 SizeLat:33 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:86 CodeSize:101 Lat:101 SizeLat:101 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:81 CodeSize:95 Lat:95 SizeLat:95 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:76 CodeSize:89 Lat:89 SizeLat:89 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:83 Lat:83 SizeLat:83 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:66 CodeSize:77 Lat:77 SizeLat:77 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:71 Lat:71 SizeLat:71 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:65 Lat:65 SizeLat:65 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, 
<8 x i1> %m8, <8 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:47 Lat:47 SizeLat:47 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:41 Lat:41 SizeLat:41 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:35 Lat:35 SizeLat:35 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:20 SizeLat:20 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:38 Lat:38 SizeLat:38 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:23 CodeSize:28 Lat:28 SizeLat:28 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr 
align 1 undef, <5 x i1> %m5, <5 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:86 Lat:86 SizeLat:86 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:67 CodeSize:81 Lat:81 SizeLat:81 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:63 CodeSize:76 Lat:76 SizeLat:76 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:59 CodeSize:71 Lat:71 SizeLat:71 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:66 Lat:66 SizeLat:66 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:61 Lat:61 SizeLat:61 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:47 CodeSize:56 Lat:56 SizeLat:56 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:40 Lat:40 SizeLat:40 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:35 Lat:35 SizeLat:35 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:30 Lat:30 SizeLat:30 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:17 Lat:17 SizeLat:17 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 
RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:45 Lat:45 SizeLat:45 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:33 Lat:33 SizeLat:33 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:86 CodeSize:101 Lat:101 SizeLat:101 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:81 CodeSize:95 Lat:95 SizeLat:95 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:76 CodeSize:89 Lat:89 SizeLat:89 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:71 CodeSize:83 Lat:83 SizeLat:83 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:66 CodeSize:77 Lat:77 SizeLat:77 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:71 Lat:71 SizeLat:71 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:65 Lat:65 SizeLat:65 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE2-NEXT: 
Cost Model: Found costs of RThru:40 CodeSize:47 Lat:47 SizeLat:47 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:41 Lat:41 SizeLat:41 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:35 Lat:35 SizeLat:35 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:20 SizeLat:20 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_load' -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:32 Lat:32 SizeLat:32 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:19 
CodeSize:24 Lat:24 SizeLat:24 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F32 = call <3 x float> 
@llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: 
%V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:32 Lat:32 SizeLat:32 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:19 CodeSize:24 
Lat:24 SizeLat:24 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: %V3F32 = call <3 x float> 
@llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 
SizeLat:40 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX-LABEL: 'masked_load' -; AVX-NEXT: Cost Model: Found costs of 4 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; AVX-NEXT: Cost Model: Found 
costs of 5 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX-NEXT: Cost Model: Found costs of 4 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; AVX-NEXT: Cost Model: Found costs of 4 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; 
AVX-NEXT: Cost Model: Found costs of 5 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX-NEXT: Cost Model: Found costs of 4 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 5 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX-NEXT: Cost 
Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V10F32 
= call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; AVX-NEXT: Cost Model: Found costs of 4 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V11I32 = call 
<11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 5 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 2 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_load' -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; KNL-NEXT: 
Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> 
%m8, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of 
RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 
for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: 
%V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_load' -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> 
undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr 
undef, i32 1, <1 x i1> %m1, <1 x float> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr undef, i32 1, <15 x i1> %m15, <15 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr undef, i32 1, <14 x i1> %m14, <14 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr undef, i32 1, <13 x i1> %m13, <13 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr undef, i32 1, <12 x i1> %m12, <12 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr undef, i32 1, <11 x i1> %m11, <11 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr undef, i32 1, <10 x i1> %m10, <10 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr undef, i32 1, <9 x i1> %m9, <9 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr undef, i32 1, <7 x i1> %m7, <7 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr undef, i32 1, <6 x i1> %m6, <6 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr undef, i32 1, <5 x i1> %m5, <5 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr undef, i32 1, <3 x i1> %m3, <3 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 
CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr undef, i32 1, <1 x i1> %m1, <1 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of 2 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F64 = call <7 x double> @llvm.masked.load.v7f64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F64 = call <6 x double> @llvm.masked.load.v6f64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F64 = call <5 x double> @llvm.masked.load.v5f64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F64 = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F64 = call <3 x double> @llvm.masked.load.v3f64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V2F64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.load.v1f64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16F32 = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V15F32 = call <15 x float> @llvm.masked.load.v15f32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V14F32 = call <14 x float> @llvm.masked.load.v14f32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V13F32 = call <13 x float> @llvm.masked.load.v13f32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V12F32 = call <12 x float> @llvm.masked.load.v12f32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V11F32 = call <11 x float> @llvm.masked.load.v11f32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x float> undef) +; SKX-NEXT: Cost 
Model: Found costs of 2 for: %V10F32 = call <10 x float> @llvm.masked.load.v10f32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V9F32 = call <9 x float> @llvm.masked.load.v9f32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8F32 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7F32 = call <7 x float> @llvm.masked.load.v7f32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6F32 = call <6 x float> @llvm.masked.load.v6f32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5F32 = call <5 x float> @llvm.masked.load.v5f32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4F32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3F32 = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V2F32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F32 = call <1 x float> @llvm.masked.load.v1f32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x float> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I64 = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I64 = call <7 x i64> @llvm.masked.load.v7i64.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I64 = call <6 x i64> @llvm.masked.load.v6i64.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I64 = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I64 = call <3 x i64> @llvm.masked.load.v3i64.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V2I64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I32 = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V15I32 = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 1 undef, <15 x i1> %m15, <15 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V14I32 = call <14 x i32> @llvm.masked.load.v14i32.p0(ptr align 1 undef, <14 x i1> %m14, <14 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V13I32 = call <13 x i32> @llvm.masked.load.v13i32.p0(ptr align 1 undef, <13 x i1> %m13, <13 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V12I32 = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 1 undef, <12 x i1> %m12, <12 x i32> undef) +; SKX-NEXT: Cost Model: 
Found costs of 2 for: %V11I32 = call <11 x i32> @llvm.masked.load.v11i32.p0(ptr align 1 undef, <11 x i1> %m11, <11 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V10I32 = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 1 undef, <10 x i1> %m10, <10 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V9I32 = call <9 x i32> @llvm.masked.load.v9i32.p0(ptr align 1 undef, <9 x i1> %m9, <9 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I32 = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V7I32 = call <7 x i32> @llvm.masked.load.v7i32.p0(ptr align 1 undef, <7 x i1> %m7, <7 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V6I32 = call <6 x i32> @llvm.masked.load.v6i32.p0(ptr align 1 undef, <6 x i1> %m6, <6 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V5I32 = call <5 x i32> @llvm.masked.load.v5i32.p0(ptr align 1 undef, <5 x i1> %m5, <5 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V4I32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V3I32 = call <3 x i32> @llvm.masked.load.v3i32.p0(ptr align 1 undef, <3 x i1> %m3, <3 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V2I32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I32 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 1 undef, <1 x i1> %m1, <1 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I16 = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I16 = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V8I16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V4I16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 1 for: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of 2 for: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.load.v8f64.p0(ptr undef, i32 1, <8 x i1> %m8, <8 x double> undef) @@ -371,298 +371,298 @@ define i32 @masked_load(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, <5 x i1> %m5, <6 x i1> %m6, <7 x i1> %m7, <8 x i1> %m8, <9 x i1> %m9, <10 x i1> %m10, <11 x i1> %m11, <12 x i1> %m12, <13 x i1> %m13, <14 x i1> %m14, <15 x i1> %m15, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_store' -; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 
for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:70 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:79 Lat:79 SizeLat:79 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE2-NEXT: Cost Model: Found costs of RThru:60 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:68 Lat:68 SizeLat:68 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:62 Lat:62 SizeLat:62 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:56 Lat:56 SizeLat:56 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE2-NEXT: Cost Model: Found costs of RThru:41 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void 
@llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:44 Lat:44 SizeLat:44 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:85 CodeSize:100 Lat:100 SizeLat:100 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE2-NEXT: Cost Model: Found costs of RThru:73 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE2-NEXT: Cost Model: Found costs of RThru:68 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE2-NEXT: Cost Model: Found costs of RThru:62 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE2-NEXT: Cost Model: Found costs of RThru:50 CodeSize:59 Lat:59 SizeLat:59 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; 
SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 
SizeLat:15 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:75 CodeSize:91 Lat:91 SizeLat:91 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:70 CodeSize:85 Lat:85 SizeLat:85 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:79 Lat:79 SizeLat:79 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE2-NEXT: Cost Model: Found costs of RThru:60 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:68 Lat:68 SizeLat:68 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:62 Lat:62 SizeLat:62 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:56 Lat:56 SizeLat:56 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE2-NEXT: Cost Model: Found costs of RThru:41 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:43 CodeSize:51 Lat:51 SizeLat:51 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:44 Lat:44 SizeLat:44 for: call void 
@llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:91 CodeSize:107 Lat:107 SizeLat:107 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:85 CodeSize:100 Lat:100 SizeLat:100 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE2-NEXT: Cost Model: Found costs of RThru:73 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE2-NEXT: Cost Model: Found costs of RThru:68 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE2-NEXT: Cost Model: Found costs of RThru:62 CodeSize:73 Lat:73 SizeLat:73 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE2-NEXT: Cost Model: Found costs of RThru:56 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE2-NEXT: Cost Model: Found costs of RThru:50 CodeSize:59 Lat:59 SizeLat:59 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE2-NEXT: Cost Model: Found costs of RThru:45 CodeSize:53 Lat:53 SizeLat:53 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:46 Lat:46 SizeLat:46 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:32 Lat:32 SizeLat:32 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, 
<3 x i1> %m3) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:158 CodeSize:190 Lat:190 SizeLat:190 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:79 CodeSize:95 Lat:95 SizeLat:95 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:376 CodeSize:440 Lat:440 SizeLat:440 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; SSE2-NEXT: Cost Model: Found costs of RThru:188 CodeSize:220 Lat:220 SizeLat:220 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:94 CodeSize:110 Lat:110 SizeLat:110 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:46 CodeSize:54 Lat:54 SizeLat:54 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_store' -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:17 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:13 Lat:13 SizeLat:13 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: 
call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE42-NEXT: Cost Model: Found costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> 
undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost 
Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:27 Lat:27 SizeLat:27 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:17 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:10 CodeSize:13 Lat:13 SizeLat:13 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:76 Lat:76 SizeLat:76 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:66 Lat:66 SizeLat:66 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:57 Lat:57 SizeLat:57 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE42-NEXT: Cost Model: Found costs of RThru:41 CodeSize:52 Lat:52 SizeLat:52 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE42-NEXT: Cost Model: Found 
costs of RThru:37 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:42 Lat:42 SizeLat:42 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:28 SizeLat:28 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:18 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:14 SizeLat:14 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:60 CodeSize:75 Lat:75 SizeLat:75 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:70 Lat:70 SizeLat:70 for: 
call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:60 Lat:60 SizeLat:60 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; SSE42-NEXT: Cost Model: Found costs of RThru:44 CodeSize:55 Lat:55 SizeLat:55 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:50 Lat:50 SizeLat:50 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:30 Lat:30 SizeLat:30 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:25 Lat:25 SizeLat:25 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:256 CodeSize:320 Lat:320 SizeLat:320 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:160 Lat:160 SizeLat:160 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void 
@llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX-LABEL: 'masked_store' -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr 
undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; AVX-NEXT: Cost Model: Found 
costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; AVX-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void 
@llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of 16 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; AVX-NEXT: Cost Model: Found costs of 17 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 
1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; AVX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; AVX-NEXT: Cost Model: Found costs of RThru:131 CodeSize:163 Lat:163 SizeLat:163 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; AVX-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; AVX-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; AVX-NEXT: Cost Model: Found costs of RThru:260 CodeSize:324 Lat:324 SizeLat:324 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; AVX-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; AVX-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; AVX-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_store' -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 
for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, 
i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void 
@llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void 
@llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 1 undef, <11 x i1> %m11) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; KNL-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; KNL-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of RThru:132 CodeSize:164 Lat:164 SizeLat:164 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:32 
CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:262 CodeSize:326 Lat:326 SizeLat:326 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; KNL-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:64 CodeSize:80 Lat:80 SizeLat:80 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:32 CodeSize:40 Lat:40 SizeLat:40 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_store' -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr undef, i32 1, <15 x i1> %m15) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr undef, i32 1, <14 x i1> %m14) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr undef, i32 1, <13 x i1> %m13) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12f32.p0(<12 x float> undef, ptr undef, i32 1, <12 x i1> %m12) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr undef, i32 1, <11 x i1> %m11) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr undef, i32 1, <10 x i1> %m10) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr undef, i32 1, <9 x i1> %m9) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void 
@llvm.masked.store.v7f32.p0(<7 x float> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr undef, i32 1, <15 x i1> %m15) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr undef, i32 1, <14 x i1> %m14) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr undef, i32 1, <13 x i1> %m13) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr undef, i32 1, <12 x i1> %m12) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr undef, i32 1, <11 x i1> %m11) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr undef, i32 1, <10 x i1> %m10) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr undef, i32 1, <9 x i1> %m9) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr undef, i32 1, <7 x i1> %m7) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr undef, i32 1, <6 x i1> %m6) -; SKX-NEXT: Cost 
Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr undef, i32 1, <5 x i1> %m5) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr undef, i32 1, <3 x i1> %m3) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr undef, i32 1, <64 x i1> %m64) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f64.p0(<7 x double> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f64.p0(<6 x double> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f64.p0(<5 x double> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f64.p0(<4 x double> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f64.p0(<3 x double> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2f64.p0(<2 x double> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f64.p0(<1 x double> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16f32.p0(<16 x float> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15f32.p0(<15 x float> undef, ptr align 1 undef, <15 x i1> %m15) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14f32.p0(<14 x float> undef, ptr align 1 undef, <14 x i1> %m14) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13f32.p0(<13 x float> undef, ptr align 1 undef, <13 x i1> %m13) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void 
@llvm.masked.store.v12f32.p0(<12 x float> undef, ptr align 1 undef, <12 x i1> %m12) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11f32.p0(<11 x float> undef, ptr align 1 undef, <11 x i1> %m11) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10f32.p0(<10 x float> undef, ptr align 1 undef, <10 x i1> %m10) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9f32.p0(<9 x float> undef, ptr align 1 undef, <9 x i1> %m9) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8f32.p0(<8 x float> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7f32.p0(<7 x float> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6f32.p0(<6 x float> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5f32.p0(<5 x float> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4f32.p0(<4 x float> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3f32.p0(<3 x float> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1f32.p0(<1 x float> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i64.p0(<8 x i64> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i64.p0(<7 x i64> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i64.p0(<6 x i64> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i64.p0(<5 x i64> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i64.p0(<4 x i64> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i64.p0(<3 x i64> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v2i64.p0(<2 x i64> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i64.p0(<1 x i64> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i32.p0(<16 x i32> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v15i32.p0(<15 x i32> undef, ptr align 1 undef, <15 x i1> %m15) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v14i32.p0(<14 x i32> undef, ptr align 1 undef, <14 x i1> %m14) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v13i32.p0(<13 x i32> undef, ptr align 1 undef, <13 x i1> %m13) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v12i32.p0(<12 x i32> undef, ptr align 1 undef, <12 x i1> %m12) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v11i32.p0(<11 x i32> undef, ptr align 
1 undef, <11 x i1> %m11) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v10i32.p0(<10 x i32> undef, ptr align 1 undef, <10 x i1> %m10) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v9i32.p0(<9 x i32> undef, ptr align 1 undef, <9 x i1> %m9) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i32.p0(<8 x i32> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v7i32.p0(<7 x i32> undef, ptr align 1 undef, <7 x i1> %m7) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v6i32.p0(<6 x i32> undef, ptr align 1 undef, <6 x i1> %m6) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v5i32.p0(<5 x i32> undef, ptr align 1 undef, <5 x i1> %m5) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v3i32.p0(<3 x i32> undef, ptr align 1 undef, <3 x i1> %m3) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> undef, ptr align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.store.v1i32.p0(<1 x i32> undef, ptr align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i16.p0(<32 x i16> undef, ptr align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i16.p0(<16 x i16> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v8i16.p0(<8 x i16> undef, ptr align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v4i16.p0(<4 x i16> undef, ptr align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v64i8.p0(<64 x i8> undef, ptr align 1 undef, <64 x i1> %m64) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v32i8.p0(<32 x i8> undef, ptr align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v16i8.p0(<16 x i8> undef, ptr align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v8i8.p0(<8 x i8> undef, ptr align 1 undef, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; call void @llvm.masked.store.v8f64.p0(<8 x double> undef, ptr undef, i32 1, <8 x i1> %m8) @@ -732,192 +732,192 @@ define i32 @masked_store(<1 x i1> %m1, <2 x i1> %m2, <3 x i1> %m3, <4 x i1> %m4, define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_gather' -; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) 
-; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 
x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 
undef, <4 x i1> %m4, <4 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_gather' -; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x 
float> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 
x i1> %m4, <4 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4I16 = call <4 x i16> 
@llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX1-LABEL: 'masked_gather' -; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 
x i1> %m16, <16 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> 
@llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; 
AVX2-LABEL: 'masked_gather' -; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 
1, <16 x i1> %m16, <16 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x 
ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKL-LABEL: 'masked_gather' -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> 
undef) -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKL-NEXT: Cost Model: 
Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:12 CodeSize:2 Lat:12 SizeLat:12 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:54 
CodeSize:70 Lat:70 SizeLat:70 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_gather' -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> 
@llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 
x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_gather' -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x double> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x float> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> undef, i32 1, <1 x i1> %m1, <1 x i64> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i32> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 1, <2 x i1> %m2, <2 x i32> undef) -; SKX-NEXT: Cost Model: 
Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> %m4, <4 x i16> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> undef, i32 1, <64 x i1> %m64, <64 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> %m32, <32 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> %m16, <16 x i8> undef) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1F64 = call <1 x double> @llvm.masked.gather.v1f64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x double> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I64 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i64> undef) +; SKX-NEXT: Cost 
Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: %V1I64 = call <1 x i64> @llvm.masked.gather.v1i64.v1p0(<1 x ptr> align 1 undef, <1 x i1> %m1, <1 x i64> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 undef, <2 x i1> %m2, <2 x i32> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: %V32I16 = call <32 x i16> @llvm.masked.gather.v32i16.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 1 undef, <4 x i1> %m4, <4 x i16> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0(<64 x ptr> align 1 undef, <64 x i1> %m64, <64 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 undef, <32 x i1> %m32, <32 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 undef, <16 x i1> %m16, <16 x i8> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 undef, <8 x i1> %m8, <8 x i8> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; %V8F64 = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> undef, i32 1, <8 x i1> %m8, <8 x double> undef) @@ -955,192 +955,192 @@ define i32 @masked_gather(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8 define i32 @masked_scatter(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x i1> %m8, <16 x i1> %m16, <32 x i1> %m32, <64 x i1> %m64) { ; SSE2-LABEL: 'masked_scatter' -; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 
x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 Lat:162 SizeLat:162 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE2-NEXT: Cost 
Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:31 CodeSize:39 Lat:39 SizeLat:39 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:23 SizeLat:23 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE2-NEXT: Cost Model: Found costs of RThru:130 CodeSize:162 
Lat:162 SizeLat:162 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:65 CodeSize:81 Lat:81 SizeLat:81 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:33 CodeSize:41 Lat:41 SizeLat:41 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE2-NEXT: Cost Model: Found costs of RThru:320 CodeSize:384 Lat:384 SizeLat:384 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SSE2-NEXT: Cost Model: Found costs of RThru:160 CodeSize:192 Lat:192 SizeLat:192 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE2-NEXT: Cost Model: Found costs of RThru:80 CodeSize:96 Lat:96 SizeLat:96 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE2-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:48 SizeLat:48 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SSE42-LABEL: 'masked_scatter' -; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> 
undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:21 CodeSize:29 Lat:29 SizeLat:29 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:11 CodeSize:15 Lat:15 SizeLat:15 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:31 Lat:31 SizeLat:31 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) 
+; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SSE42-NEXT: Cost Model: Found costs of RThru:196 CodeSize:260 Lat:260 SizeLat:260 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SSE42-NEXT: Cost Model: Found costs of RThru:98 CodeSize:130 Lat:130 SizeLat:130 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX1-LABEL: 'masked_scatter' -; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call 
void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, 
i32 1, <4 x i1> %m4) -; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: 
Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX1-NEXT: Cost Model: Found costs of RThru:108 CodeSize:140 Lat:140 SizeLat:140 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX1-NEXT: Cost Model: Found costs of RThru:214 CodeSize:278 Lat:278 SizeLat:278 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; AVX1-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX1-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX1-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; AVX2-LABEL: 'masked_scatter' -; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void 
@llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x 
ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; AVX2-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; AVX2-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; AVX2-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; AVX2-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; AVX2-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; 
AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKL-LABEL: 'masked_scatter' -; SKL-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> 
undef, i32 1, <8 x i1> %m8) -; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:26 CodeSize:34 Lat:34 SizeLat:34 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:29 CodeSize:37 Lat:37 SizeLat:37 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:15 CodeSize:19 Lat:19 SizeLat:19 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of 
RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKL-NEXT: Cost Model: Found costs of RThru:107 CodeSize:139 Lat:139 SizeLat:139 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found costs of RThru:54 CodeSize:70 Lat:70 SizeLat:70 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKL-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKL-NEXT: Cost Model: Found costs of RThru:212 CodeSize:276 Lat:276 SizeLat:276 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SKL-NEXT: Cost Model: Found costs of RThru:106 CodeSize:138 Lat:138 SizeLat:138 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKL-NEXT: Cost Model: Found costs of RThru:53 CodeSize:69 Lat:69 SizeLat:69 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKL-NEXT: Cost Model: Found costs of RThru:27 CodeSize:35 Lat:35 SizeLat:35 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; KNL-LABEL: 'masked_scatter' -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 
1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 
Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:22 SizeLat:22 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; KNL-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; KNL-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; KNL-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; KNL-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; KNL-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; KNL-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void 
@llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; ; SKX-LABEL: 'masked_scatter' -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> undef, i32 1, <1 x i1> %m1) -; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 1, <2 x i1> %m2) -; SKX-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 
for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) -; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 1, <4 x i1> %m4) -; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> undef, i32 1, <64 x i1> %m64) -; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> %m32) -; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> %m16) -; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1f64.v1p0(<1 x double> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i64.v8p0(<8 x i64> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:2 CodeSize:3 Lat:3 SizeLat:3 for: call void @llvm.masked.scatter.v1i64.v1p0(<1 x i64> undef, <1 x ptr> align 1 undef, <1 x i1> %m1) +; SKX-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> align 1 
undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> align 1 undef, <2 x i1> %m2) +; SKX-NEXT: Cost Model: Found costs of RThru:143 CodeSize:175 Lat:175 SizeLat:175 for: call void @llvm.masked.scatter.v32i16.v32p0(<32 x i16> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of RThru:71 CodeSize:87 Lat:87 SizeLat:87 for: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) +; SKX-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> align 1 undef, <4 x i1> %m4) +; SKX-NEXT: Cost Model: Found costs of RThru:283 CodeSize:347 Lat:347 SizeLat:347 for: call void @llvm.masked.scatter.v64i8.v64p0(<64 x i8> undef, <64 x ptr> align 1 undef, <64 x i1> %m64) +; SKX-NEXT: Cost Model: Found costs of RThru:141 CodeSize:173 Lat:173 SizeLat:173 for: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> align 1 undef, <32 x i1> %m32) +; SKX-NEXT: Cost Model: Found costs of RThru:70 CodeSize:86 Lat:86 SizeLat:86 for: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> align 1 undef, <16 x i1> %m16) +; SKX-NEXT: Cost Model: Found costs of RThru:35 CodeSize:43 Lat:43 SizeLat:43 for: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> align 1 undef, <8 x i1> %m8) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 0 ; call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> undef, <8 x ptr> undef, i32 1, <8 x i1> %m8) @@ -1571,22 +1571,22 @@ define i32 @masked_compressstore(<1 x i1> %m1, <2 x i1> %m2, <4 x i1> %m4, <8 x define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { ; SSE2-LABEL: 'test1' ; SSE2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:4 SizeLat:5 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; SSE42-LABEL: 'test1' ; SSE42-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX-LABEL: 'test1' ; AVX-NEXT: Cost 
Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX512-LABEL: 'test1' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i64> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) +; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x double> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; %mask = icmp eq <2 x i64> %trigger, zeroinitializer @@ -1597,22 +1597,22 @@ define <2 x double> @test1(<2 x i64> %trigger, ptr %addr, <2 x double> %dst) { define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { ; SSE2-LABEL: 'test2' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SSE42-LABEL: 'test2' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX-LABEL: 'test2' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX-NEXT: Cost Model: Found costs of 2 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX512-LABEL: 'test2' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) +; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %addr, <4 x i1> %mask, <4 x i32> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %mask = icmp eq <4 x i32> %trigger, 
zeroinitializer @@ -1623,22 +1623,22 @@ define <4 x i32> @test2(<4 x i32> %trigger, ptr %addr, <4 x i32> %dst) { define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { ; SSE2-LABEL: 'test3' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:26 Lat:26 SizeLat:26 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test3' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test3' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of 8 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test3' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <4 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 4, <4 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of 1 for: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr align 4 %addr, <4 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask = icmp eq <4 x i32> %trigger, zeroinitializer @@ -1649,32 +1649,32 @@ define void @test3(<4 x i32> %trigger, ptr %addr, <4 x i32> %val) { define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { ; SSE2-LABEL: 'test4' ; SSE2-NEXT: Cost Model: Found costs of 2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:37 CodeSize:45 Lat:45 SizeLat:45 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; SSE42-LABEL: 'test4' ; SSE42-NEXT: Cost Model: Found costs of 2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:30 CodeSize:38 Lat:38 SizeLat:38 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr 
align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; AVX1-LABEL: 'test4' ; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:5 Lat:2 SizeLat:6 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; AVX1-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX1-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; AVX2-LABEL: 'test4' ; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; AVX2-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX2-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; SKL-LABEL: 'test4' ; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:1 SizeLat:2 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; SKL-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; SKL-NEXT: Cost Model: Found costs of 2 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; ; AVX512-LABEL: 'test4' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <8 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x float> %dst) +; AVX512-NEXT: Cost Model: Found costs of 1 for: %res = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 %addr, <8 x i1> %mask, <8 x float> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x float> %res ; %mask = icmp eq <8 x i32> %trigger, zeroinitializer @@ -1685,22 +1685,22 @@ define <8 x float> @test4(<8 x i32> %trigger, ptr %addr, <8 x float> %dst) { define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { ; SSE2-LABEL: 'test5' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test5' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: call void 
@llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test5' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test5' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2f32.p0(<2 x float> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1711,22 +1711,22 @@ define void @test5(<2 x i32> %trigger, ptr %addr, <2 x float> %val) { define void @test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { ; SSE2-LABEL: 'test6' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test6' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test6' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of 9 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test6' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %addr, i32 4, <2 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of 2 for: call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr align 4 %addr, <2 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1737,22 +1737,22 @@ define void 
@test6(<2 x i32> %trigger, ptr %addr, <2 x i32> %val) { define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { ; SSE2-LABEL: 'test7' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; ; SSE42-LABEL: 'test7' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; ; AVX-LABEL: 'test7' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; ; AVX512-LABEL: 'test7' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) +; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x float> @llvm.masked.load.v2f32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x float> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1763,22 +1763,22 @@ define <2 x float> @test7(<2 x i32> %trigger, ptr %addr, <2 x float> %dst) { define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { ; SSE2-LABEL: 'test8' ; SSE2-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:12 SizeLat:12 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; ; SSE42-LABEL: 'test8' ; SSE42-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; SSE42-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; SSE42-NEXT: Cost Model: Found costs of 
RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; ; AVX-LABEL: 'test8' ; AVX-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX-NEXT: Cost Model: Found costs of 3 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; ; AVX512-LABEL: 'test8' ; AVX512-NEXT: Cost Model: Found costs of 1 for: %mask = icmp eq <2 x i32> %trigger, zeroinitializer -; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) +; AVX512-NEXT: Cost Model: Found costs of 2 for: %res = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 %addr, <2 x i1> %mask, <2 x i32> %dst) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %res ; %mask = icmp eq <2 x i32> %trigger, zeroinitializer @@ -1788,27 +1788,27 @@ define <2 x i32> @test8(<2 x i32> %trigger, ptr %addr, <2 x i32> %dst) { define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0) { ; SSE2-LABEL: 'test_gather_2f64' -; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:10 Lat:10 SizeLat:10 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; SSE42-LABEL: 'test_gather_2f64' -; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SSE42-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX1-LABEL: 'test_gather_2f64' -; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX2-LABEL: 'test_gather_2f64' -; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:8 SizeLat:8 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> 
%src0) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; SKL-LABEL: 'test_gather_2f64' -; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; SKL-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; ; AVX512-LABEL: 'test_gather_2f64' -; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) +; AVX512-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:9 SizeLat:9 for: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 %ptrs, <2 x i1> %mask, <2 x double> %src0) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %res ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0) @@ -1817,31 +1817,31 @@ define <2 x double> @test_gather_2f64(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x doub define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %src0) { ; SSE2-LABEL: 'test_gather_4i32' -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SSE42-LABEL: 'test_gather_4i32' -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32' -; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX2-LABEL: 'test_gather_4i32' -; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 
SizeLat:1 for: ret <4 x i32> %res ; ; SKL-LABEL: 'test_gather_4i32' -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; KNL-LABEL: 'test_gather_4i32' -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SKX-LABEL: 'test_gather_4i32' -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> %mask, <4 x i32> %src0) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0) @@ -1850,31 +1850,31 @@ define <4 x i32> @test_gather_4i32(<4 x ptr> %ptrs, <4 x i1> %mask, <4 x i32> %s define <4 x i32> @test_gather_4i32_const_mask(<4 x ptr> %ptrs, <4 x i32> %src0) { ; SSE2-LABEL: 'test_gather_4i32_const_mask' -; SSE2-NEXT: Cost Model: Found costs of 19 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SSE2-NEXT: Cost Model: Found costs of 19 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SSE42-LABEL: 'test_gather_4i32_const_mask' -; SSE42-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SSE42-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX1-LABEL: 'test_gather_4i32_const_mask' -; AVX1-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; AVX1-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; AVX2-LABEL: 'test_gather_4i32_const_mask' -; AVX2-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; 
AVX2-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SKL-LABEL: 'test_gather_4i32_const_mask' -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; KNL-LABEL: 'test_gather_4i32_const_mask' -; KNL-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; KNL-NEXT: Cost Model: Found costs of 13 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; ; SKX-LABEL: 'test_gather_4i32_const_mask' -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 true), <4 x i32> %src0) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %ptrs, <4 x i1> splat (i1 true), <4 x i32> %src0) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %res ; %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0) @@ -1885,37 +1885,37 @@ define <16 x float> @test_gather_16f32_const_mask(ptr %base, <16 x i32> %ind) { ; SSE2-LABEL: 'test_gather_16f32_const_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_const_mask' ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; 
AVX1-LABEL: 'test_gather_16f32_const_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_const_mask' ; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_const_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_const_mask' ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -1929,37 +1929,37 @@ define <16 x float> @test_gather_16f32_var_mask(ptr %base, <16 x i32> %ind, <16 ; SSE2-LABEL: 'test_gather_16f32_var_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> 
%sext_ind -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_var_mask' ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_var_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_var_mask' ; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_var_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> 
%mask, <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_var_mask' ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %base, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -1973,37 +1973,37 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x ptr> %ptrs, <16 x i32> ; SSE2-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:61 CodeSize:77 Lat:77 SizeLat:77 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_ra_var_mask' ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:45 CodeSize:61 Lat:61 SizeLat:61 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX2-NEXT: Cost Model: 
Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:51 CodeSize:67 Lat:67 SizeLat:67 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_ra_var_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_ra_var_mask' ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, <16 x ptr> %ptrs, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:20 CodeSize:2 Lat:20 SizeLat:20 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.v, <16 x i1> %mask, <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %sext_ind = sext <16 x i32> %ind to <16 x i64> @@ -2019,7 +2019,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; SSE2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:8 SizeLat:8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 60 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SSE42-LABEL: 'test_gather_16f32_const_mask2' @@ -2027,7 +2027,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; 
SSE42-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found costs of 8 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 44 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX1-LABEL: 'test_gather_16f32_const_mask2' @@ -2035,7 +2035,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX2-LABEL: 'test_gather_16f32_const_mask2' @@ -2043,7 +2043,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 50 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; SKL-LABEL: 'test_gather_16f32_const_mask2' @@ -2051,7 +2051,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = 
getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:24 SizeLat:24 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; ; AVX512-LABEL: 'test_gather_16f32_const_mask2' @@ -2059,7 +2059,7 @@ define <16 x float> @test_gather_16f32_const_mask2(ptr %base, <16 x i32> %ind) { ; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX512-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <16 x i32> %ind to <16 x i64> ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> splat (i1 true), <16 x float> undef) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 %gep.random, <16 x i1> splat (i1 true), <16 x float> undef) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x float> %res ; %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 @@ -2078,7 +2078,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE2-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE2-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE2-NEXT: Cost Model: Found costs of RThru:77 CodeSize:93 Lat:93 SizeLat:93 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test_scatter_16i32' @@ -2086,7 +2086,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SSE42-NEXT: Cost Model: Found costs of 1 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SSE42-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; SSE42-NEXT: Cost Model: Found costs of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SSE42-NEXT: Cost Model: Found costs 
of RThru:49 CodeSize:65 Lat:65 SizeLat:65 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX1-LABEL: 'test_scatter_16i32' @@ -2094,7 +2094,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:3 SizeLat:3 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX1-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX1-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX2-LABEL: 'test_scatter_16i32' @@ -2102,7 +2102,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX2-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX2-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SKL-LABEL: 'test_scatter_16i32' @@ -2110,7 +2110,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; SKL-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:2 for: %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; SKL-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; SKL-NEXT: Cost Model: Found costs of RThru:55 CodeSize:71 Lat:71 SizeLat:71 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test_scatter_16i32' @@ -2118,7 +2118,7 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 ; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: 
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer ; AVX512-NEXT: Cost Model: Found costs of 0 for: %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind ; AVX512-NEXT: Cost Model: Found costs of 1 for: %imask = bitcast i16 %mask to <16 x i1> -; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) +; AVX512-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %val, <16 x ptr> align 4 %gep.random, <16 x i1> %imask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 @@ -2132,19 +2132,19 @@ define void @test_scatter_16i32(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32 define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_8i32' -; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE2-NEXT: Cost Model: Found costs of RThru:39 CodeSize:47 Lat:47 SizeLat:47 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test_scatter_8i32' -; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:33 Lat:33 SizeLat:33 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test_scatter_8i32' -; AVX-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of RThru:28 CodeSize:36 Lat:36 SizeLat:36 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX512-LABEL: 'test_scatter_8i32' -; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) +; AVX512-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:10 SizeLat:10 for: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> align 4 %ptr, <8 x i1> %mask) ; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> %mask) @@ -2153,23 +2153,23 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x ptr> %ptr, <8 x i1>%mask) { define void @test_scatter_4i32(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { ; SSE2-LABEL: 'test_scatter_4i32' -; SSE2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE2-NEXT: Cost Model: Found 
costs of RThru:20 CodeSize:24 Lat:24 SizeLat:24 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SSE42-LABEL: 'test_scatter_4i32' -; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; AVX-LABEL: 'test_scatter_4i32' -; AVX-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; AVX-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:18 SizeLat:18 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; AVX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; KNL-LABEL: 'test_scatter_4i32' -; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; KNL-NEXT: Cost Model: Found costs of RThru:17 CodeSize:21 Lat:21 SizeLat:21 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; SKX-LABEL: 'test_scatter_4i32' -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> align 4 %ptr, <4 x i1> %mask) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) @@ -2180,43 +2180,43 @@ define <4 x float> @test_gather_4f32(ptr %ptr, <4 x i32> %ind, <4 x i1>%mask) { ; SSE2-LABEL: 'test_gather_4f32' ; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32' ; SSE42-NEXT: Cost Model: Found costs of 2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x 
i1> %mask, <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of RThru:12 CodeSize:16 Lat:16 SizeLat:16 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32' ; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32' ; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:17 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SKL-LABEL: 'test_gather_4f32' ; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; KNL-LABEL: 'test_gather_4f32' ; KNL-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; KNL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of RThru:16 CodeSize:20 Lat:20 SizeLat:20 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SKX-LABEL: 'test_gather_4f32' ; SKX-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKX-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr 
float, ptr %ptr, <4 x i64> %sext_ind -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> %mask, <4 x float> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; %sext_ind = sext <4 x i32> %ind to <4 x i64> @@ -2230,43 +2230,43 @@ define <4 x float> @test_gather_4f32_const_mask(ptr %ptr, <4 x i32> %ind) { ; SSE2-LABEL: 'test_gather_4f32_const_mask' ; SSE2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:2 SizeLat:2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE2-NEXT: Cost Model: Found costs of 15 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SSE2-NEXT: Cost Model: Found costs of 15 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SSE2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SSE42-LABEL: 'test_gather_4f32_const_mask' ; SSE42-NEXT: Cost Model: Found costs of 2 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SSE42-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SSE42-NEXT: Cost Model: Found costs of 11 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SSE42-NEXT: Cost Model: Found costs of 11 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX1-LABEL: 'test_gather_4f32_const_mask' ; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX1-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX1-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; AVX1-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; AVX2-LABEL: 'test_gather_4f32_const_mask' ; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; AVX2-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; AVX2-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; AVX2-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> 
%res ; ; SKL-LABEL: 'test_gather_4f32_const_mask' ; SKL-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SKL-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SKL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; KNL-LABEL: 'test_gather_4f32_const_mask' ; KNL-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; KNL-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; KNL-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; KNL-NEXT: Cost Model: Found costs of 12 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; KNL-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; ; SKX-LABEL: 'test_gather_4f32_const_mask' ; SKX-NEXT: Cost Model: Found costs of 1 for: %sext_ind = sext <4 x i32> %ind to <4 x i64> ; SKX-NEXT: Cost Model: Found costs of 0 for: %gep.v = getelementptr float, ptr %ptr, <4 x i64> %sext_ind -; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.v, i32 4, <4 x i1> splat (i1 true), <4 x float> undef) +; SKX-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:6 SizeLat:6 for: %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 %gep.v, <4 x i1> splat (i1 true), <4 x float> undef) ; SKX-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %res ; %sext_ind = sext <4 x i32> %ind to <4 x i64> diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll new file mode 100644 index 0000000..e43d00d --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-cast.ll @@ -0,0 +1,207 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \ +; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s + +; int8_t offset = start; +; for (int i = 0; i < 100; i++, offset += step) +; a[sext(offset)] = 0; +; +define void @sext_nsw(ptr %a, i8 %start, i8 %step) { +; CHECK-LABEL: 'sext_nsw' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {(sext i8 %start to i64),+,(sext i8 %step to i64)}<nsw><%loop> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - none! 
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i8 [ %start, %entry ], [ %offset.next, %loop ]
+  %offset.sext = sext i8 %offset to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.sext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %offset.next = add nsw i8 %offset, %step
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; The addition for `%offset.next` can wrap, so we cannot prove monotonicity.
+;
+; int8_t offset = start;
+; for (int i = 0; i < 100; i++, offset += step)
+;   a[sext(offset)] = 0;
+;
+define void @sext_may_wrap(ptr %a, i8 %start, i8 %step) {
+; CHECK-LABEL: 'sext_may_wrap'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (sext i8 {%start,+,%step}<%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (sext i8 {%start,+,%step}<%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i8 [ %start, %entry ], [ %offset.next, %loop ]
+  %offset.sext = sext i8 %offset to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.sext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %offset.next = add i8 %offset, %step
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; for (int8_t i = 0; i < 100; i++)
+;   a[zext(offset)] = 0;
+;
+define void @zext_pos(ptr %a) {
+; CHECK-LABEL: 'zext_pos'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {0,+,1}<nuw><nsw><%loop>
+; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - none!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i8 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset.zext = zext nneg i8 %i to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.zext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i8 %i, 1
+  %exitcond = icmp eq i8 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; The zero-extended value of `offset` is no longer monotonic. In fact, the
+; values of `offset` in each iteration are:
+;
+; iteration    | 0   | 1 | 2 | ...
+; -------------|-----|---|---|---------
+; offset       | -1  | 0 | 1 | ...
+; zext(offset) | 255 | 0 | 1 | ...
+;
+;
+; for (int8_t i = -1; i < 100; i++)
+;   a[zext(offset)] = 0;
+;
+define void @zext_cross_zero(ptr %a) {
+; CHECK-LABEL: 'zext_cross_zero'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (zext i8 {-1,+,1}<nsw><%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (zext i8 {-1,+,1}<nsw><%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i8 [ -1, %entry ], [ %i.inc, %loop ]
+  %offset.zext = zext nneg i8 %i to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.zext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i8 %i, 1
+  %exitcond = icmp eq i8 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; In principle, we can prove that `zext(offset)` is monotonic since we know
+; that `offset` is non-negative.
+;
+; int8_t offset = 0;
+; for (int i = 0; i < 100; i++, offset += step)
+;   a[zext(offset)] = 0;
+;
+define void @zext_nneg_nsw(ptr %a, i8 %step) {
+; CHECK-LABEL: 'zext_nneg_nsw'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (zext i8 {0,+,%step}<nsw><%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (zext i8 {0,+,%step}<nsw><%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %offset = phi i8 [ 0, %entry ], [ %offset.next, %loop ]
+  %offset.zext = zext nneg i8 %offset to i64
+  %idx = getelementptr i8, ptr %a, i64 %offset.zext
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %offset.next = add nsw i8 %offset, %step
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; SCEV handles `i & 1` as an i1 addrec. Ensure that the monotonicity analysis
+; properly analyzes it.
+;
+; for (i = 0; i < 100; i++)
+;   a[i & 1] = 0;
+;
+define void @offset_truncated_to_i1(ptr %a) {
+; CHECK-LABEL: 'offset_truncated_to_i1'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: (zext i1 {false,+,true}<%loop> to i64)
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: (zext i1 {false,+,true}<%loop> to i64)
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ]
+  %and = and i64 %i, 1
+  %idx = getelementptr inbounds i8, ptr %a, i64 %and
+  store i8 0, ptr %idx
+  %i.inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.inc, 100
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll
new file mode 100644
index 0000000..71ea4e9
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-delinearize.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \
+; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s
+
+; The offset SCEV will be delinearized into a 2D array access, as follows:
+;
+; - Outer subscript: {0,+,1}<nuw><nsw><%loop.i.header>
+; - Inner subscript: {0,+,1}<nuw><nsw><%loop.j.header>
+;
+; These subscripts are both monotonic, but we also need to check the
+; monotonicity of the original addrec.
+; +; char A[...][32]; +; for (i = 0; i < 1ll << 62; i++) +; for (j = 0; j < 32; j++) +; if (i < (1ll << 57)) +; A[i][j] = 0; +; +define void @linearized_offset_wrap(ptr %a) { +; CHECK-LABEL: 'linearized_offset_wrap' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %gep, align 1 +; CHECK-NEXT: Expr: {{\{\{}}0,+,32}<%loop.i.header>,+,1}<nw><%loop.j.header> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: {{\{\{}}0,+,32}<%loop.i.header>,+,1}<nw><%loop.j.header> +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %gep, align 1 --> Dst: store i8 0, ptr %gep, align 1 +; CHECK-NEXT: da analyze - confused! +; +entry: + br label %loop.i.header + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.header + +loop.j.header: + %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j.latch ] + %cond = icmp slt i64 %i, 144115188075855872 ; 2^57 + br i1 %cond, label %if.then, label %loop.j.latch + +if.then: + %gep = getelementptr inbounds [32 x i8], ptr %a, i64 %i, i64 %j + store i8 0, ptr %gep + br label %loop.j.latch + +loop.j.latch: + %j.inc = add nuw nsw i64 %j, 1 + %ec.j = icmp eq i64 %j.inc, 32 + br i1 %ec.j, label %loop.i.latch, label %loop.j.header + +loop.i.latch: + %i.inc = add nuw nsw i64 %i, 1 + %ec.i = icmp eq i64 %i.inc, 4611686018427387904 ; 2^62 + br i1 %ec.i, label %exit, label %loop.i.header + +exit: + ret void +} diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll new file mode 100644 index 0000000..e5b6ddb --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-invariant.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \ +; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s + +; for (int i = 0; i < n; i++) +; a[x] = 0; +define void @single_loop_invariant(ptr %a, i64 %x, i64 %n) { +; CHECK-LABEL: 'single_loop_invariant' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: %x +; CHECK-NEXT: Monotonicity: Invariant +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - consistent output [S]! +; +entry: + %guard = icmp sgt i64 %n, 0 + br i1 %guard, label %loop, label %exit + +loop: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %idx = getelementptr inbounds i8, ptr %a, i64 %x + store i8 0, ptr %idx + %i.inc = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.inc, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; for (int i = 0; i < n; i++) +; a[(i % 2 == 0 ? x : y)] = 0; +define void @single_loop_variant(ptr %a, i64 %x, i64 %y, i64 %n) { +; CHECK-LABEL: 'single_loop_variant' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: %offset +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: %offset +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused! 
+; +entry: + %guard = icmp sgt i64 %n, 0 + br i1 %guard, label %loop, label %exit + +loop: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %offset = phi i64 [ %x, %entry ], [ %offset.next, %loop ] + %offset.next = phi i64 [ %y, %entry ], [ %offset, %loop ] + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %i.inc = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.inc, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; a[x + i] = 0; +define void @invariant_plus_monotonic0(ptr %a, i64 %x, i64 %n, i64 %m) { +; CHECK-LABEL: 'invariant_plus_monotonic0' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {%x,+,1}<nsw><%loop.i.header> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - consistent output [0 S]! +; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + %offset = phi i64 [ %x, %entry ], [ %offset.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ] + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nuw nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %offset.inc = add nsw i64 %offset, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; a[x + j] = 0; +define void @invariant_plus_monotonic1(ptr %a, i64 %x, i64 %n, i64 %m) { +; CHECK-LABEL: 'invariant_plus_monotonic1' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {%x,+,1}<nsw><%loop.j> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - consistent output [S 0]! 
+; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ] + %offset = phi i64 [ %x, %loop.j.preheader ], [ %offset.inc, %loop.j ] + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nuw nsw i64 %j, 1 + %offset.inc = add nsw i64 %offset, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll new file mode 100644 index 0000000..7411dc9 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \ +; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s + +; for (int i = 0; i < n; i++) +; a[i] = 0; +; +define void @single_loop_nsw(ptr %a, i64 %n) { +; CHECK-LABEL: 'single_loop_nsw' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {0,+,1}<nuw><nsw><%loop> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - none! +; +entry: + %guard = icmp sgt i64 %n, 0 + br i1 %guard, label %loop, label %exit + +loop: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %idx = getelementptr inbounds i8, ptr %a, i64 %i + store i8 0, ptr %idx + %i.inc = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.inc, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; The purpose of the variable `begin` is to avoid violating the size limitation +; of the allocated object in LLVM IR, which would cause UB. +; +; for (unsigned long long i = begin; i < end; i++) +; a[i] = 0; +; +define void @single_loop_nuw(ptr %a, i64 %begin, i64 %end) { +; CHECK-LABEL: 'single_loop_nuw' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {%begin,+,1}<nuw><%loop> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: {%begin,+,1}<nuw><%loop> +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused! 
+; +entry: + %guard = icmp ult i64 %begin, %end + br i1 %guard, label %loop, label %exit + +loop: + %i = phi i64 [ %begin, %entry ], [ %i.inc, %loop ] + %idx = getelementptr i8, ptr %a, i64 %i + store i8 0, ptr %idx + %i.inc = add nuw i64 %i, 1 + %exitcond = icmp eq i64 %i.inc, %end + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; a[i + j] = 0; +; +define void @nested_loop_nsw0(ptr %a, i64 %n, i64 %m) { +; CHECK-LABEL: 'nested_loop_nsw0' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nuw><nsw><%loop.j> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - output [* *]! +; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ] + %offset = add nsw i64 %i, %j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; for (int i = n - 1; i >= 0; i--) +; for (int j = 0; j < m; j++) +; a[i + j] = 0; +; +define void @nested_loop_nsw1(ptr %a, i64 %n, i64 %m) { +; CHECK-LABEL: 'nested_loop_nsw1' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}(-1 + %n),+,-1}<nsw><%loop.i.header>,+,1}<nsw><%loop.j> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - output [* *]! 
+; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ %n, %entry ], [ %i.dec, %loop.i.latch ] + %i.dec = add nsw i64 %i, -1 + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ] + %offset = add nsw i64 %i.dec, %j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %exitcond.i = icmp eq i64 %i.dec, 0 + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; a[i - j] = 0; +; +define void @nested_loop_nsw2(ptr %a, i64 %n, i64 %m) { +; CHECK-LABEL: 'nested_loop_nsw2' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,-1}<nsw><%loop.j> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - output [* *]! +; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ] + %offset = sub nsw i64 %i, %j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; for (int i = begin0; i < end0; i++) +; for (int j = begin1; j < end1; j++) { +; unsigned long long offset = (unsigned long long)i + (unsigned long long)j; +; a[offset] = 0; +; } +; +define void @nested_loop_nuw(ptr %a, i64 %begin0, i64 %end0, i64 %begin1, i64 %end1) { +; CHECK-LABEL: 'nested_loop_nuw' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}(%begin0 + %begin1),+,1}<nw><%loop.i.header>,+,1}<nw><%loop.j> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: {{\{\{}}(%begin0 + %begin1),+,1}<nw><%loop.i.header>,+,1}<nw><%loop.j> +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused!
+; +entry: + %guard.i.0 = icmp slt i64 0, %begin0 + %guard.i.1 = icmp slt i64 %begin0, %end0 + %guard.i.2 = icmp slt i64 0, %end0 + %and.i.0 = and i1 %guard.i.0, %guard.i.1 + %and.i.1 = and i1 %and.i.0, %guard.i.2 + br i1 %and.i.1, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ %begin0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %guard.j.0 = icmp slt i64 0, %begin1 + %guard.j.1 = icmp slt i64 %begin1, %end1 + %guard.j.2 = icmp slt i64 0, %end1 + %and.j.0 = and i1 %guard.j.0, %guard.j.1 + %and.j.1 = and i1 %and.j.0, %guard.j.2 + br i1 %and.j.1, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ %begin1, %loop.j.preheader ], [ %j.inc, %loop.j ] + %offset = add nuw i64 %i, %j + %idx = getelementptr i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, %end1 + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %end0 + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; a[i + step*j] = 0; +; +define void @nested_loop_step(ptr %a, i64 %n, i64 %m, i64 %step) { +; CHECK-LABEL: 'nested_loop_step' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,%step}<nsw><%loop.j> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - output [* *]! +; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j, label %loop.i.latch + +loop.j: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j ] + %offset.j = phi i64 [ 0, %loop.j.preheader ], [ %offset.j.next, %loop.j ] + %offset = add nsw i64 %i, %offset.j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %offset.j.next = add nsw i64 %offset.j, %step + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; The value of the step recurrence is not invariant with respect to the +; outermost loop (the i-loop). +; +; offset_i = 0; +; for (int i = 0; i < 100; i++) { +; for (int j = 0; j < 100; j++) +; a[offset_i + j] = 0; +; offset_i += (i % 2 == 0) ? 0 : 3; +; } +; +define void @step_is_variant(ptr %a) { +; CHECK-LABEL: 'step_is_variant' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {%offset.i,+,1}<nuw><nsw><%loop.j> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused!
+; +entry: + br label %loop.i.header + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + %offset.i = phi i64 [ 0, %entry ], [ %offset.i.next, %loop.i.latch ] + %step.i.0 = phi i64 [ 0, %entry ], [ %step.i.1, %loop.i.latch ] + %step.i.1 = phi i64 [ 3, %entry ], [ %step.i.0, %loop.i.latch ] + br label %loop.j + +loop.j: + %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j ] + %offset = add nsw i64 %offset.i, %j + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, 100 + br i1 %exitcond.j, label %loop.i.latch, label %loop.j + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %offset.i.next = add nsw i64 %offset.i, %step.i.0 + %exitcond.i = icmp eq i64 %i.inc, 100 + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; The AddRec doesn't have nsw flag for the j-loop, since the store may not be +; executed. +; +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; if (cond) +; a[i + j] = 0; +; +define void @conditional_store0(ptr %a, i64 %n, i64 %m) { +; CHECK-LABEL: 'conditional_store0' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nw><%loop.j.header> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nw><%loop.j.header> +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - confused! +; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j.header, label %loop.i.latch + +loop.j.header: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j.latch ] + %offset = add nsw i64 %i, %j + %cond = freeze i1 poison + br i1 %cond, label %if.then, label %loop.j.latch + +if.then: + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + br label %loop.j.latch + +loop.j.latch: + %j.inc = add nsw i64 %j, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j.header + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; Similar to the @conditional_store0, but the definition of the `%offset` is +; different from it and we can infer `nsw` in this case. +; +; for (int i = 0; i < n; i++) +; for (int j = 0; j < m; j++) +; if (cond) +; a[i + j] = 0; +; +define void @conditional_store1(ptr %a, i64 %n, i64 %m) { +; CHECK-LABEL: 'conditional_store1' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: Expr: {{\{\{}}0,+,1}<nuw><nsw><%loop.i.header>,+,1}<nuw><nsw><%loop.j.header> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1 +; CHECK-NEXT: da analyze - output [* *]! 
+; +entry: + %guard.i = icmp sgt i64 %n, 0 + br i1 %guard.i, label %loop.i.header, label %exit + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + br label %loop.j.preheader + +loop.j.preheader: + %gurard.j = icmp sgt i64 %m, 0 + br i1 %gurard.j, label %loop.j.header, label %loop.i.latch + +loop.j.header: + %j = phi i64 [ 0, %loop.j.preheader ], [ %j.inc, %loop.j.latch ] + %offset = phi i64 [ %i, %loop.j.preheader ], [ %offset.next, %loop.j.latch ] + %cond = freeze i1 poison + br i1 %cond, label %if.then, label %loop.j.latch + +if.then: + %idx = getelementptr inbounds i8, ptr %a, i64 %offset + store i8 0, ptr %idx + br label %loop.j.latch + +loop.j.latch: + %j.inc = add nsw i64 %j, 1 + %offset.next = add nsw i64 %offset, 1 + %exitcond.j = icmp eq i64 %j.inc, %m + br i1 %exitcond.j, label %loop.i.latch, label %loop.j.header + +loop.i.latch: + %i.inc = add nsw i64 %i, 1 + %exitcond.i = icmp eq i64 %i.inc, %n + br i1 %exitcond.i, label %exit, label %loop.i.header + +exit: + ret void +} + +; In the following case, the computation `offset = offset_i + j` will not wrap, +; but `offset_i += 1024` will wrap both in a signed sense and an unsigned +; sense. We cannot prove the monotonicity in this case. +; +; offset_i = (1ULL << 63) - 256; +; for (i = 0; i < (1ULL << 62); i++, offset_i += 1024) +; for (j = 0; j < 32; j++) { +; offset = offset_i + j; +; +; // The value of `offset` is positive in a signed sense. +; if (offset < (1ULL << 63)) +; a[offset] = 0; +; } +; +define void @outer_loop_may_wrap(ptr %a) { +; CHECK-LABEL: 'outer_loop_may_wrap' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 0, ptr %gep, align 1 +; CHECK-NEXT: Expr: {{\{\{}}9223372036854775552,+,1024}<%loop.i.header>,+,1}<nuw><nsw><%loop.j.header> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: {9223372036854775552,+,1024}<%loop.i.header> +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 0, ptr %gep, align 1 --> Dst: store i8 0, ptr %gep, align 1 +; CHECK-NEXT: da analyze - confused! 
+; +entry: + br label %loop.i.header + +loop.i.header: + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ] + %subscript.i = phi i64 [ 9223372036854775552, %entry ], [ %subscript.i.next, %loop.i.latch ] ; The initial value is 2^63 - 256 + br label %loop.j.header + +loop.j.header: + %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j.latch ] + %subscript = phi i64 [ %subscript.i, %loop.i.header ], [ %subscript.next, %loop.j.latch ] + %cond = icmp sge i64 %subscript, 0 + br i1 %cond, label %if.then, label %loop.j.latch + +if.then: + %gep = getelementptr inbounds i8, ptr %a, i64 %subscript + store i8 0, ptr %gep + br label %loop.j.latch + +loop.j.latch: + %j.inc = add nuw nsw i64 %j, 1 + %subscript.next = add nuw nsw i64 %subscript, 1 + %ec.j = icmp eq i64 %j.inc, 32 + br i1 %ec.j, label %loop.i.latch, label %loop.j.header + +loop.i.latch: + %i.inc = add nuw nsw i64 %i, 1 + %subscript.i.next = add i64 %subscript.i, 1024 + %ec.i = icmp eq i64 %i.inc, 4611686018427387904 ; 2^62 + br i1 %ec.i, label %exit, label %loop.i.header + +exit: + ret void +} diff --git a/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll b/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll new file mode 100644 index 0000000..6247336 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/non-monotonic.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -disable-output -passes="print<da>" -da-dump-monotonicity-report \ +; RUN: -da-enable-monotonicity-check 2>&1 | FileCheck %s +; RUN: opt < %s -disable-output -passes="print<da>" 2>&1 | FileCheck %s -check-prefix=DISABLE-CHECK + +; +; for (i = 0; i < (1ULL << 60); i++) { +; A[i] = 1; +; +; unsigned long long offset = i * 32 + (1ULL << 62); +; // offset is positive when interpreted as a signed value; this prevents +; // violating the size limitation for an allocated object. +; if (offset < (1ULL << 63)) +; A[offset] = 2; +; } +; +; ----------------------------------------------------------------------------- +; +; There is a dependency between the two stores. To detect it, we need to check +; the monotonicity and bail out of the analysis since `offset` is not monotonic. +; +; memory location | first store (A[i]) | second store (A[offset]) +; ------------------|--------------------|---------------------------- +; A[0] | i = 0 | i = 2^59 - 2^57 +; A[2^60 - 32] | i = 2^60 - 32 | i = 2^59 - 2^57 + 2^55 - 1 +; +define void @f(ptr %A) { +; CHECK-LABEL: 'f' +; CHECK-NEXT: Monotonicity check: +; CHECK-NEXT: Inst: store i8 1, ptr %idx.0, align 1 +; CHECK-NEXT: Expr: {0,+,1}<nuw><nsw><%loop.header> +; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic +; CHECK-NEXT: Inst: store i8 2, ptr %idx.1, align 1 +; CHECK-NEXT: Expr: {4611686018427387904,+,32}<%loop.header> +; CHECK-NEXT: Monotonicity: Unknown +; CHECK-NEXT: Reason: {4611686018427387904,+,32}<%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1 +; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1 +; CHECK-NEXT: da analyze - confused! +; CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1 +; CHECK-NEXT: da analyze - confused! +; +; DISABLE-CHECK-LABEL: 'f' +; DISABLE-CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 1, ptr %idx.0, align 1 +; DISABLE-CHECK-NEXT: da analyze - none!
+; DISABLE-CHECK-NEXT: Src: store i8 1, ptr %idx.0, align 1 --> Dst: store i8 2, ptr %idx.1, align 1 +; DISABLE-CHECK-NEXT: da analyze - none! +; DISABLE-CHECK-NEXT: Src: store i8 2, ptr %idx.1, align 1 --> Dst: store i8 2, ptr %idx.1, align 1 +; DISABLE-CHECK-NEXT: da analyze - none! +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ] + %idx.0 = getelementptr inbounds i8, ptr %A, i64 %i + store i8 1, ptr %idx.0 + %offset.tmp = mul i64 %i, 32 + %offset = add i64 %offset.tmp, 4611686018427387904 ; 1ULL << 62 + %if.cond = icmp sge i64 %offset, 0 + br i1 %if.cond, label %if.then, label %loop.latch + +if.then: + %idx.1 = getelementptr inbounds i8, ptr %A, i64 %offset + store i8 2, ptr %idx.1 + br label %loop.latch + +loop.latch: + %i.next = add nuw nsw i64 %i, 1 + %exit.cond = icmp eq i64 %i.next, 1152921504606846976 ; 1ULL << 60 + br i1 %exit.cond, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll index 3c3748a..1964fca 100644 --- a/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/backedge-taken-count-guard-info.ll @@ -104,3 +104,56 @@ exit: } declare void @clobber() + + +declare void @clobber.i32(i32) + +define void @test_guards_across_loops(i32 %N) { +; CHECK-LABEL: 'test_guards_across_loops' +; CHECK-NEXT: Classifying expressions for: @test_guards_across_loops +; CHECK-NEXT: %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ] +; CHECK-NEXT: --> {0,+,1}<%loop.1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop.1: Computable } +; CHECK-NEXT: %iv.1.next = add i32 %iv.1, 1 +; CHECK-NEXT: --> {1,+,1}<%loop.1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop.1: Computable } +; CHECK-NEXT: %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ] +; CHECK-NEXT: --> {0,+,1}<nuw><%loop.2> U: full-set S: full-set Exits: (1 + %N) LoopDispositions: { %loop.2: Computable } +; CHECK-NEXT: %iv.2.next = add i32 %iv.2, 1 +; CHECK-NEXT: --> {1,+,1}<nw><%loop.2> U: full-set S: full-set Exits: (2 + %N) LoopDispositions: { %loop.2: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_guards_across_loops +; CHECK-NEXT: Loop %loop.2: backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw> +; CHECK-NEXT: Loop %loop.2: constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Loop %loop.2: symbolic max backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw> +; CHECK-NEXT: Loop %loop.2: Trip multiple is 1 +; CHECK-NEXT: Loop %loop.1: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop.1: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop.1: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop.1: Predicated backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw> +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: <nusw> +; CHECK-NEXT: Loop %loop.1: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: <nusw> +; CHECK-NEXT: Loop %loop.1: Predicated symbolic max backedge-taken count is (1 + (zext i32 %N to i64))<nuw><nsw> +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop.1> Added Flags: <nusw> +; +entry: + br label %loop.1 + +loop.1: + %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %loop.1 ] + call void @clobber.i32(i32 %iv.1) + %ec.1 = icmp ugt i32 %iv.1, %N + %iv.1.next = add i32 %iv.1, 1 + br i1 %ec.1, label %loop.2, label %loop.1 + +loop.2: + %iv.2 = phi i32 [ 0, %loop.1 ], [ %iv.2.next, %loop.2 ] + call void @clobber.i32(i32 %iv.2) + %ec.2 = icmp ugt i32 %iv.2, %N + %iv.2.next = add i32 %iv.2, 1 + br i1 %ec.2, label %exit, label %loop.2 + +exit: + ret void +} diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll index 4024c98..d5a2181 100644 --- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll @@ -1431,7 +1431,7 @@ define void @ptr_induction_early_exit_eq_1_with_align_on_load(ptr %a, ptr %b, pt ; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count. ; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) @@ -1470,7 +1470,7 @@ define void @ptr_induction_early_exit_eq_1_with_align_on_arguments(ptr align 8 % ; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count. ; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a to i64)) + (ptrtoint ptr %b to i64)) /u 8) @@ -1511,7 +1511,7 @@ define void @ptr_induction_early_exit_eq_1_align_assumption_1(ptr %a, ptr %b, pt ; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count. 
; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) @@ -1556,7 +1556,7 @@ define void @ptr_induction_early_exit_eq_1_align_assumption_2(ptr %a, ptr %b, pt ; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count. ; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) -; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693951 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 2305843009213693950 ; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) ; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** ; CHECK-NEXT: symbolic max exit count for loop.inc: ((-8 + (-1 * (ptrtoint ptr %a_ to i64)) + (ptrtoint ptr %b_ to i64)) /u 8) diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll index a477465c..f35c48b 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll @@ -717,5 +717,46 @@ exit: ret void } +define void @test_urem_non_constant(ptr %dst, i32 %a, i32 %b) { +; CHECK-LABEL: 'test_urem_non_constant' +; CHECK-NEXT: Classifying expressions for: @test_urem_non_constant +; CHECK-NEXT: %rem = urem i32 %a, %b +; CHECK-NEXT: --> ((-1 * (%a /u %b) * %b) + %a) U: full-set S: full-set +; CHECK-NEXT: %and.0 = and i1 %pre.0, %pre.1 +; CHECK-NEXT: --> (%pre.1 umin %pre.0) U: full-set S: full-set +; CHECK-NEXT: %and.1 = and i1 %and.0, %pre.2 +; CHECK-NEXT: --> (%pre.1 umin %pre.2 umin %pre.0) U: full-set S: full-set +; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {0,+,%b}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b +; CHECK-NEXT: --> ((sext i32 %b to i64) + %dst) U: full-set S: full-set Exits: ((sext i32 %b to i64) + %dst) LoopDispositions: { %loop: Invariant } +; CHECK-NEXT: %iv.next = add i32 %iv, %b +; CHECK-NEXT: --> {%b,+,%b}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_urem_non_constant +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; +entry: + %rem = urem i32 %a, %b + %pre.0 = icmp eq i32 %rem, 0 + %pre.1 = icmp ne i32 %a, 0 + %pre.2 = icmp ne i32 %b, 0 + %and.0 = and i1 %pre.0, %pre.1 + %and.1 = and i1 %and.0, %pre.2 + br i1 %and.1, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry], [ %iv.next, %loop ] + %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %b + store i8 0, ptr %gep.dst + %iv.next = add i32 %iv, %b + %ec = icmp ne i32 %iv.next, %a + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + declare void @llvm.assume(i1) declare void @llvm.experimental.guard(i1, ...) diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll index dc4a72e..d148b9e 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll @@ -10,8 +10,8 @@ define <8 x i16> @test0(ptr %p, ptr %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt ; CHECK-LABEL: define <8 x i16> @test0( ; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], <8 x i16> [[Y:%.*]], <8 x i1> [[M:%.*]], <8 x i16> [[PT:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[A:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[P]], i32 16, <8 x i1> [[M]], <8 x i16> [[PT]]) #[[ATTR2:[0-9]+]], !tbaa [[B_TBAA0:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[Y]], ptr [[Q]], i32 16, <8 x i1> [[M]]), !tbaa [[A_TBAA3:![0-9]+]] +; CHECK-NEXT: [[A:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 16 [[P]], <8 x i1> [[M]], <8 x i16> [[PT]]), !tbaa [[B_TBAA0:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[Y]], ptr align 16 [[Q]], <8 x i1> [[M]]), !tbaa [[A_TBAA3:![0-9]+]] ; CHECK-NEXT: [[C:%.*]] = add <8 x i16> [[A]], [[A]] ; CHECK-NEXT: ret <8 x i16> [[C]] ; diff --git a/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll b/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll index bd0da02..e24db99 100644 --- a/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll +++ b/llvm/test/Assembler/masked-load-store-intrinsics-attributes.ll @@ -4,11 +4,11 @@ ; Specifically `captures(none)' should be added to the pointer parameters for ; the loads/stores -; CHECK: declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr captures(none), i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]] -declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>) +; CHECK: declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr captures(none), <vscale x 2 x i1>, <vscale x 2 x i64>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]] +declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr, <vscale x 2 x i1>, <vscale x 2 x i64>) -; CHECK: declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr captures(none), i32 immarg, <vscale x 2 x i1>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_WILLRETURN_WRITEONLY:#[0-9]+]] -declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>) +; CHECK: declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr captures(none), <vscale x 2 x i1>) [[ARGMEMONLY_NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_WILLRETURN_WRITEONLY:#[0-9]+]] +declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>) ; CHECK: declare <16 x float> @llvm.masked.expandload.v16f32(ptr captures(none), <16 x i1>, <16 x float>) 
[[NOCALLBACK_NOFREE_NOSYNC_NOUNWIND_READONLY_WILLRETURN:#[0-9]+]] declare <16 x float> @llvm.masked.expandload.v16f32 (ptr, <16 x i1>, <16 x float>) diff --git a/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll b/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll index a4667ab..0c44b30 100644 --- a/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll +++ b/llvm/test/Bitcode/upgrade-masked-keep-metadata.ll @@ -4,7 +4,7 @@ define <4 x i32> @load(ptr nocapture readonly %a0) !dbg !8 { ; CHECK-LABEL: define <4 x i32> @load( ; CHECK-SAME: ptr readonly captures(none) [[A0:%.*]]) !dbg [[DBG8:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0]], i32 16, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> undef), !dbg [[DBG19:![0-9]+]], !tbaa [[CHAR_TBAA20:![0-9]+]] +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 [[A0]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> undef), !dbg [[DBG19:![0-9]+]], !tbaa [[CHAR_TBAA20:![0-9]+]] ; CHECK-NEXT: ret <4 x i32> [[V0]], !dbg [[DBG23:![0-9]+]] ; entry: @@ -16,7 +16,7 @@ define void @store(<4 x i32> %a0, ptr nocapture %a1) !dbg !24 { ; CHECK-LABEL: define void @store( ; CHECK-SAME: <4 x i32> [[A0:%.*]], ptr captures(none) [[A1:%.*]]) !dbg [[DBG24:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0]], ptr [[A1]], i32 16, <4 x i1> <i1 false, i1 true, i1 false, i1 true>), !dbg [[DBG30:![0-9]+]], !tbaa [[CHAR_TBAA20]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0]], ptr align 16 [[A1]], <4 x i1> <i1 false, i1 true, i1 false, i1 true>), !dbg [[DBG30:![0-9]+]], !tbaa [[CHAR_TBAA20]] ; CHECK-NEXT: ret void, !dbg [[DBG31:![0-9]+]] ; entry: @@ -28,7 +28,7 @@ define <4 x i32> @gather(<4 x ptr> %a0) !dbg !32 { ; CHECK-LABEL: define <4 x i32> @gather( ; CHECK-SAME: <4 x ptr> [[A0:%.*]]) !dbg [[DBG32:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[A0]], i32 16, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> undef), !dbg [[DBG35:![0-9]+]], !tbaa [[CHAR_TBAA20]] +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 16 [[A0]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> undef), !dbg [[DBG35:![0-9]+]], !tbaa [[CHAR_TBAA20]] ; CHECK-NEXT: ret <4 x i32> [[V0]], !dbg [[DBG36:![0-9]+]] ; entry: @@ -40,7 +40,7 @@ define void @scatter(<4 x i32> %a0, <4 x ptr> %a1) !dbg !37 { ; CHECK-LABEL: define void @scatter( ; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x ptr> [[A1:%.*]]) !dbg [[DBG37:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[A0]], <4 x ptr> [[A1]], i32 16, <4 x i1> <i1 false, i1 true, i1 true, i1 true>), !dbg [[DBG41:![0-9]+]], !tbaa [[CHAR_TBAA20]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[A0]], <4 x ptr> align 16 [[A1]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>), !dbg [[DBG41:![0-9]+]], !tbaa [[CHAR_TBAA20]] ; CHECK-NEXT: ret void, !dbg [[DBG42:![0-9]+]] ; entry: diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index b215c51..0933e67 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -1371,11 +1371,10 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc ; 
CHECK-SD-NEXT: lsr x9, x0, #16 ; CHECK-SD-NEXT: adrp x8, .LCPI14_0 ; CHECK-SD-NEXT: dup v4.8h, w0 -; CHECK-SD-NEXT: dup v1.8h, w9 -; CHECK-SD-NEXT: fmov s3, w9 -; CHECK-SD-NEXT: sqneg v2.8h, v1.8h -; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] -; CHECK-SD-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b +; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: dup v2.8h, w9 +; CHECK-SD-NEXT: sqneg v1.8h, v2.8h +; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b ; CHECK-SD-NEXT: rev32 v2.8h, v0.8h ; CHECK-SD-NEXT: sqdmull v3.4s, v0.4h, v4.4h ; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v4.8h diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir index 284d624..f34d3ed 100644 --- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir +++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir @@ -1,16 +1,12 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ -# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ -# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ -# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ -# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ -# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ -# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,-zcz-gpr64" %s \ +# RUN: | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-NOZCZ-GPR64 %s +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,-zcz-gpr64" %s \ +# RUN: | FileCheck --check-prefix=CHECK-ZCZ-GPR32-NOZCZ-GPR64 %s +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcz-gpr32,+zcz-gpr64" %s \ +# RUN: | FileCheck --check-prefix=CHECK-NOZCZ-GPR32-ZCZ-GPR64 %s +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcz-gpr32,+zcz-gpr64" %s \ +# RUN: | FileCheck --check-prefix=CHECK-ZCZ-GPR32-ZCZ-GPR64 %s --- | define void @f0(i64 noundef %x) { ret void } @@ -24,41 +20,29 @@ liveins: body: | bb.0: liveins: $x0, $lr - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0 - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; 
CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0 + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}} + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f0 - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f0 + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}} + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0 + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0 - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, undef $xzr, implicit $wzr - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0 + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}} + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = ORRWrr $wzr, $wzr + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f0 - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $w0 = ORRWrr $wzr, $wzr - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 - ; - ; CHECK-NO-ZCM-ZCZ-LABEL: name: f0 - ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr - ; CHECK-NO-ZCM-ZCZ-NEXT: {{ $}} - ; CHECK-NO-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0 - ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 - ; - ; CHECK-ZCM-ZCZ-LABEL: name: f0 - ; CHECK-ZCM-ZCZ: liveins: $x0, $lr - ; CHECK-ZCM-ZCZ-NEXT: {{ $}} - ; CHECK-ZCM-ZCZ-NEXT: $w0 = MOVZWi 0, 0 - ; CHECK-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f0 + ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}} + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $w0 = MOVZWi 0, 0 + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, 
implicit-def $w0 $w0 = COPY $wzr BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ... @@ -69,41 +53,29 @@ liveins: body: | bb.0: liveins: $x0, $lr - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1 - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr - ; CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 - ; - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-LABEL: name: f1 - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr - ; CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 - ; - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1 - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr - ; CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1 + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}} + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr + ; CHECK-NOZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-LABEL: name: f1 - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ: liveins: $x0, $lr - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: {{ $}} - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: $x0 = ORRXrr $xzr, $xzr - ; CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-LABEL: name: f1 + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64: liveins: $x0, $lr + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: {{ $}} + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: $x0 = ORRXrr $xzr, $xzr + ; CHECK-ZCZ-GPR32-NOZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; - ; CHECK-NO-ZCM-ZCZ-LABEL: name: f1 - ; CHECK-NO-ZCM-ZCZ: liveins: $x0, $lr - ; CHECK-NO-ZCM-ZCZ-NEXT: {{ $}} - ; CHECK-NO-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0 - ; CHECK-NO-ZCM-ZCZ-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1 + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}} + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0 + ; CHECK-NOZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ; - ; CHECK-ZCM-ZCZ-LABEL: name: f1 - ; CHECK-ZCM-ZCZ: liveins: $x0, $lr - ; CHECK-ZCM-ZCZ-NEXT: {{ $}} - ; CHECK-ZCM-ZCZ-NEXT: $x0 = MOVZXi 0, 0 - ; CHECK-ZCM-ZCZ-NEXT:BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 + ; 
CHECK-ZCZ-GPR32-ZCZ-GPR64-LABEL: name: f1 + ; CHECK-ZCZ-GPR32-ZCZ-GPR64: liveins: $x0, $lr + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: {{ $}} + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: $x0 = MOVZXi 0, 0 + ; CHECK-ZCZ-GPR32-ZCZ-GPR64-NEXT: BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 $x0 = COPY $xzr BL @f2, csr_darwin_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp, implicit-def $w0 ... diff --git a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll index 97a7741..849323f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll @@ -421,3 +421,83 @@ for.body51: ; preds = %is_sbox.exit155 unreachable } declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, ptr nocapture) nounwind ssp + +; CHECK-LABEL: fold_imm1_csinc_32: +; CHECK: cmp w0, w1 +; CHECK-NEXT: csinc w0, w2, wzr, ge +; CHECK-NEXT: ret +define i32 @fold_imm1_csinc_32(i32 %x, i32 %y, i32 %n) nounwind ssp { +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %exit + +if.else: + br label %exit + +exit: + %result = phi i32 [ 1, %if.then ], [ %n, %if.else ] + ret i32 %result +} + +; CHECK-LABEL: fold_imm1_csinc_64: +; CHECK: cmp x0, x1 +; CHECK-NEXT: csinc x0, x2, xzr, ge +; CHECK-NEXT: ret +define i64 @fold_imm1_csinc_64(i64 %x, i64 %y, i64 %n) nounwind ssp { +entry: + %cmp = icmp slt i64 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %exit + +if.else: + br label %exit + +exit: + %result = phi i64 [ 1, %if.then ], [ %n, %if.else ] + ret i64 %result +} + +; CHECK-LABEL: fold_imm1_cset_32: +; CHECK: cmp w0, w1 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret +define i32 @fold_imm1_cset_32(i32 %x, i32 %y) nounwind ssp { +entry: + %cmp = icmp slt i32 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %exit + +if.else: + br label %exit + +exit: + %result = phi i32 [ 1, %if.then ], [ 0, %if.else ] + ret i32 %result +} + +; CHECK-LABEL: fold_imm1_cset_64: +; CHECK: cmp x0, x1 +; CHECK-NEXT: cset x0, lt +; CHECK-NEXT: ret +define i64 @fold_imm1_cset_64(i64 %x, i64 %y) nounwind ssp { +entry: + %cmp = icmp slt i64 %x, %y + br i1 %cmp, label %if.then, label %if.else + +if.then: + br label %exit + +if.else: + br label %exit + +exit: + %result = phi i64 [ 1, %if.then ], [ 0, %if.else ] + ret i64 %result +} diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index d4cc154..52ca22b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -1,38 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,GENERIC -; RUN: llc < %s -O0 -fast-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,FAST -; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \ -; RUN: -mtriple=arm64-eabi -aarch64-neon-syntax=apple \ -; RUN: | FileCheck %s --check-prefixes=GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -fast-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FI +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple 
-global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for test_vcvt_bf16_f64 -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f64_f32) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f64_f32) define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f64_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl v0.2d, v0.2s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_f64_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl v0.2d, v0.2s -; GISEL-NEXT: ret %vcvt1.i = fpext <2 x float> %x to <2 x double> ret <2 x double> %vcvt1.i } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f64_f32) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f64_f32) define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_high_f64_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_f64_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3> %vcvt1.i = fpext <2 x float> %cvt_in to <2 x double> ret <2 x double> %vcvt1.i @@ -43,11 +29,6 @@ define <2 x double> @test_vcvt_high_v1f64_f32_bitcast(<4 x float> %x) nounwind r ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v1f64_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %bc1 = bitcast <4 x float> %x to <2 x double> %ext = shufflevector <2 x double> %bc1, <2 x double> undef, <1 x i32> <i32 1> %bc2 = bitcast <1 x double> %ext to <2 x float> @@ -60,11 +41,6 @@ define <2 x double> @test_vcvt_high_v1i64_f32_bitcast(<2 x i64> %x) nounwind rea ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v1i64_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1> %bc2 = bitcast <1 x i64> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -76,11 +52,6 @@ define <2 x double> @test_vcvt_high_v2i32_f32_bitcast(<4 x i32> %x) nounwind rea ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v2i32_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %bc2 = bitcast <2 x i32> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -92,11 +63,6 @@ define <2 x double> @test_vcvt_high_v4i16_f32_bitcast(<8 x i16> %x) nounwind rea ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v4i16_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %bc2 = bitcast <4 x i16> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -108,11 +74,6 @@ define <2 x double> @test_vcvt_high_v8i8_f32_bitcast(<16 x i8> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.2d, v0.4s ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v8i8_f32_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.2d, v0.4s -; GISEL-NEXT: ret %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> 
<i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %bc2 = bitcast <8 x i8> %ext to <2 x float> %r = fpext <2 x float> %bc2 to <2 x double> @@ -124,11 +85,6 @@ define <4 x float> @test_vcvt_high_v1i64_f16_bitcast(<2 x i64> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v1i64_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1> %bc2 = bitcast <1 x i64> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> @@ -140,11 +96,6 @@ define <4 x float> @test_vcvt_high_v2i32_f16_bitcast(<4 x i32> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v2i32_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %bc2 = bitcast <2 x i32> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> @@ -156,11 +107,6 @@ define <4 x float> @test_vcvt_high_v4i16_f16_bitcast(<8 x i16> %x) nounwind read ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v4i16_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %bc2 = bitcast <4 x i16> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> @@ -172,134 +118,118 @@ define <4 x float> @test_vcvt_high_v8i8_f16_bitcast(<16 x i8> %x) nounwind readn ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_high_v8i8_f16_bitcast: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtl2 v0.4s, v0.8h -; GISEL-NEXT: ret %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> %bc2 = bitcast <8 x i8> %ext to <4 x half> %r = fpext <4 x half> %bc2 to <4 x float> ret <4 x float> %r } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f32_f64) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f32_f64) define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp { ; CHECK-LABEL: test_vcvt_f32_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvt_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtn v0.2s, v0.2d -; GISEL-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x float> ret <2 x float> %vcvt1.i } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64) define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { -; GENERIC-LABEL: test_vcvt_bf16_f64: -; GENERIC: // %bb.0: -; GENERIC-NEXT: fcvtxn v0.2s, v0.2d -; GENERIC-NEXT: movi.4s v1, #1 -; GENERIC-NEXT: movi.4s v2, #127, msl #8 -; GENERIC-NEXT: ushr.4s v3, v0, #16 -; GENERIC-NEXT: add.4s v2, v0, v2 -; GENERIC-NEXT: and.16b v1, v3, v1 -; GENERIC-NEXT: fcmeq.4s v3, v0, v0 -; GENERIC-NEXT: orr.4s v0, #64, lsl #16 -; GENERIC-NEXT: add.4s v1, v1, v2 -; GENERIC-NEXT: bit.16b v0, v1, v3 -; GENERIC-NEXT: shrn.4h v0, v0, #16 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvt_bf16_f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-SD-NEXT: movi.4s v1, #1 +; CHECK-SD-NEXT: movi.4s v2, #127, msl #8 +; CHECK-SD-NEXT: ushr.4s v3, v0, #16 +; CHECK-SD-NEXT: add.4s 
v2, v0, v2 +; CHECK-SD-NEXT: and.16b v1, v3, v1 +; CHECK-SD-NEXT: fcmeq.4s v3, v0, v0 +; CHECK-SD-NEXT: orr.4s v0, #64, lsl #16 +; CHECK-SD-NEXT: add.4s v1, v1, v2 +; CHECK-SD-NEXT: bit.16b v0, v1, v3 +; CHECK-SD-NEXT: shrn.4h v0, v0, #16 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvt_bf16_f64: -; FAST: // %bb.0: -; FAST-NEXT: fcvtxn v1.2s, v0.2d -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d1 -; FAST-NEXT: ushr.4s v1, v0, #16 -; FAST-NEXT: movi.4s v2, #1 -; FAST-NEXT: and.16b v1, v1, v2 -; FAST-NEXT: add.4s v1, v1, v0 -; FAST-NEXT: movi.4s v2, #127, msl #8 -; FAST-NEXT: add.4s v1, v1, v2 -; FAST-NEXT: mov.16b v2, v0 -; FAST-NEXT: orr.4s v2, #64, lsl #16 -; FAST-NEXT: fcmeq.4s v0, v0, v0 -; FAST-NEXT: bsl.16b v0, v1, v2 -; FAST-NEXT: shrn.4h v0, v0, #16 -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvt_bf16_f64: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fcvtxn v1.2s, v0.2d +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d1 +; CHECK-FI-NEXT: ushr.4s v1, v0, #16 +; CHECK-FI-NEXT: movi.4s v2, #1 +; CHECK-FI-NEXT: and.16b v1, v1, v2 +; CHECK-FI-NEXT: add.4s v1, v1, v0 +; CHECK-FI-NEXT: movi.4s v2, #127, msl #8 +; CHECK-FI-NEXT: add.4s v1, v1, v2 +; CHECK-FI-NEXT: mov.16b v2, v0 +; CHECK-FI-NEXT: orr.4s v2, #64, lsl #16 +; CHECK-FI-NEXT: fcmeq.4s v0, v0, v0 +; CHECK-FI-NEXT: bsl.16b v0, v1, v2 +; CHECK-FI-NEXT: shrn.4h v0, v0, #16 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvt_bf16_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: movi.4s v1, #1 -; GISEL-NEXT: movi.4s v2, #127, msl #8 -; GISEL-NEXT: ushr.4s v3, v0, #16 -; GISEL-NEXT: add.4s v2, v0, v2 -; GISEL-NEXT: and.16b v1, v3, v1 -; GISEL-NEXT: fcmeq.4s v3, v0, v0 -; GISEL-NEXT: orr.4s v0, #64, lsl #16 -; GISEL-NEXT: add.4s v1, v1, v2 -; GISEL-NEXT: bit.16b v0, v1, v3 -; GISEL-NEXT: shrn.4h v0, v0, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvt_bf16_f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-GI-NEXT: movi.4s v1, #1 +; CHECK-GI-NEXT: movi.4s v2, #127, msl #8 +; CHECK-GI-NEXT: ushr.4s v3, v0, #16 +; CHECK-GI-NEXT: add.4s v2, v0, v2 +; CHECK-GI-NEXT: and.16b v1, v3, v1 +; CHECK-GI-NEXT: fcmeq.4s v3, v0, v0 +; CHECK-GI-NEXT: orr.4s v0, #64, lsl #16 +; CHECK-GI-NEXT: add.4s v1, v1, v2 +; CHECK-GI-NEXT: bit.16b v0, v1, v3 +; CHECK-GI-NEXT: shrn.4h v0, v0, #16 +; CHECK-GI-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat> ret <2 x bfloat> %vcvt1.i } define half @test_vcvt_f16_f32(<1 x float> %x) { -; GENERIC-LABEL: test_vcvt_f16_f32: -; GENERIC: // %bb.0: -; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0 -; GENERIC-NEXT: fcvt h0, s0 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvt_f16_f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvt_f16_f32: -; FAST: // %bb.0: -; FAST-NEXT: fmov d1, d0 -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d1 -; FAST-NEXT: // kill: def $s0 killed $s0 killed $q0 -; FAST-NEXT: fcvt h0, s0 -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvt_f16_f32: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov d1, d0 +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d1 +; CHECK-FI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FI-NEXT: fcvt h0, s0 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvt_f16_f32: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvt_f16_f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ret %tmp = fptrunc 
<1 x float> %x to <1 x half> %elt = extractelement <1 x half> %tmp, i32 0 ret half %elt } -; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f32_f64) -; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f32_f64) define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp { -; GENERIC-LABEL: test_vcvt_high_f32_f64: -; GENERIC: // %bb.0: -; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0 -; GENERIC-NEXT: fcvtn2 v0.4s, v1.2d -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvt_high_f32_f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvt_high_f32_f64: -; FAST: // %bb.0: -; FAST-NEXT: fmov d2, d0 -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d2 -; FAST-NEXT: fcvtn2 v0.4s, v1.2d -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvt_high_f32_f64: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov d2, d0 +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d2 +; CHECK-FI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvt_high_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: fcvtn2 v0.4s, v1.2d -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvt_high_f32_f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-GI-NEXT: ret %cvt = fptrunc <2 x double> %v to <2 x float> %vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x float> %vcvt2.i @@ -310,99 +240,80 @@ define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp { ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtxn v0.2s, v0.2d ; CHECK-NEXT: ret -; -; GISEL-LABEL: test_vcvtx_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: ret %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind ret <2 x float> %vcvtx1.i } define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp { -; GENERIC-LABEL: test_vcvtx_high_f32_f64: -; GENERIC: // %bb.0: -; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0 -; GENERIC-NEXT: fcvtxn2 v0.4s, v1.2d -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: test_vcvtx_high_f32_f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: test_vcvtx_high_f32_f64: -; FAST: // %bb.0: -; FAST-NEXT: fmov d2, d0 -; FAST-NEXT: // implicit-def: $q0 -; FAST-NEXT: fmov d0, d2 -; FAST-NEXT: fcvtxn2 v0.4s, v1.2d -; FAST-NEXT: ret +; CHECK-FI-LABEL: test_vcvtx_high_f32_f64: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov d2, d0 +; CHECK-FI-NEXT: // implicit-def: $q0 +; CHECK-FI-NEXT: fmov d0, d2 +; CHECK-FI-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: test_vcvtx_high_f32_f64: -; GISEL: // %bb.0: -; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: fcvtxn2 v0.4s, v1.2d -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_vcvtx_high_f32_f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-GI-NEXT: ret %vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ret <4 x float> %res } - -declare <2 x double> 
@llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone -declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone - -declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone -declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone - -declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone - define i16 @to_half(float %in) { -; GENERIC-LABEL: to_half: -; GENERIC: // %bb.0: -; GENERIC-NEXT: fcvt h0, s0 -; GENERIC-NEXT: fmov w0, s0 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: to_half: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: to_half: -; FAST: // %bb.0: -; FAST-NEXT: fcvt h1, s0 -; FAST-NEXT: // implicit-def: $w0 -; FAST-NEXT: fmov s0, w0 -; FAST-NEXT: fmov s0, s1 -; FAST-NEXT: fmov w0, s0 -; FAST-NEXT: // kill: def $w1 killed $w0 -; FAST-NEXT: ret +; CHECK-FI-LABEL: to_half: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fcvt h1, s0 +; CHECK-FI-NEXT: // implicit-def: $w0 +; CHECK-FI-NEXT: fmov s0, w0 +; CHECK-FI-NEXT: fmov s0, s1 +; CHECK-FI-NEXT: fmov w0, s0 +; CHECK-FI-NEXT: // kill: def $w1 killed $w0 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: to_half: -; GISEL: // %bb.0: -; GISEL-NEXT: fcvt h0, s0 -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: to_half: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %res = call i16 @llvm.convert.to.fp16.f32(float %in) ret i16 %res } define float @from_half(i16 %in) { -; GENERIC-LABEL: from_half: -; GENERIC: // %bb.0: -; GENERIC-NEXT: fmov s0, w0 -; GENERIC-NEXT: fcvt s0, h0 -; GENERIC-NEXT: ret +; CHECK-SD-LABEL: from_half: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fcvt s0, h0 +; CHECK-SD-NEXT: ret ; -; FAST-LABEL: from_half: -; FAST: // %bb.0: -; FAST-NEXT: fmov s0, w0 -; FAST-NEXT: // kill: def $h0 killed $h0 killed $s0 -; FAST-NEXT: fcvt s0, h0 -; FAST-NEXT: ret +; CHECK-FI-LABEL: from_half: +; CHECK-FI: // %bb.0: +; CHECK-FI-NEXT: fmov s0, w0 +; CHECK-FI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-FI-NEXT: fcvt s0, h0 +; CHECK-FI-NEXT: ret ; -; GISEL-LABEL: from_half: -; GISEL: // %bb.0: -; GISEL-NEXT: fmov s0, w0 -; GISEL-NEXT: fcvt s0, h0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: from_half: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: ret %res = call float @llvm.convert.from.fp16.f32(i16 %in) ret float %res } - -declare float @llvm.convert.from.fp16.f32(i16) #1 -declare i16 @llvm.convert.to.fp16.f32(float) #1 -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/peephole-csel.ll b/llvm/test/CodeGen/AArch64/peephole-csel.ll index 868b9f1..b0852580 100644 --- a/llvm/test/CodeGen/AArch64/peephole-csel.ll +++ b/llvm/test/CodeGen/AArch64/peephole-csel.ll @@ -5,10 +5,9 @@ define void @peephole_csel(ptr %dst, i1 %0, i1 %cmp) { ; CHECK-LABEL: peephole_csel: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: tst w2, #0x1 -; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: mov x9, xzr +; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: tst w1, #0x1 -; CHECK-NEXT: csel x8, x8, x9, eq +; CHECK-NEXT: csinc x8, x8, xzr, ne ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/rax1.ll b/llvm/test/CodeGen/AArch64/rax1.ll index f679e90..4bb551f 100644 --- a/llvm/test/CodeGen/AArch64/rax1.ll +++ b/llvm/test/CodeGen/AArch64/rax1.ll @@ -1,22 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s -; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s +; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=SHA3,SHA3-SD %s +; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=NOSHA3,NOSHA3-SD %s +; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s -global-isel | FileCheck --check-prefixes=SHA3,SHA3-GI %s +; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s -global-isel | FileCheck --check-prefixes=NOSHA3,NOSHA3-GI %s define <2 x i64> @rax1(<2 x i64> %x, <2 x i64> %y) { -; SHA3-LABEL: rax1: -; SHA3: // %bb.0: -; SHA3-NEXT: rax1 v0.2d, v0.2d, v1.2d -; SHA3-NEXT: ret +; SHA3-SD-LABEL: rax1: +; SHA3-SD: // %bb.0: +; SHA3-SD-NEXT: rax1 v0.2d, v0.2d, v1.2d +; SHA3-SD-NEXT: ret ; -; NOSHA3-LABEL: rax1: -; NOSHA3: // %bb.0: -; NOSHA3-NEXT: add v2.2d, v1.2d, v1.2d -; NOSHA3-NEXT: usra v2.2d, v1.2d, #63 -; NOSHA3-NEXT: eor v0.16b, v0.16b, v2.16b -; NOSHA3-NEXT: ret +; NOSHA3-SD-LABEL: rax1: +; NOSHA3-SD: // %bb.0: +; NOSHA3-SD-NEXT: add v2.2d, v1.2d, v1.2d +; NOSHA3-SD-NEXT: usra v2.2d, v1.2d, #63 +; NOSHA3-SD-NEXT: eor v0.16b, v0.16b, v2.16b +; NOSHA3-SD-NEXT: ret +; +; SHA3-GI-LABEL: rax1: +; SHA3-GI: // %bb.0: +; SHA3-GI-NEXT: shl v2.2d, v1.2d, #1 +; SHA3-GI-NEXT: ushr v1.2d, v1.2d, #63 +; SHA3-GI-NEXT: orr v1.16b, v2.16b, v1.16b +; SHA3-GI-NEXT: eor v0.16b, v0.16b, v1.16b +; SHA3-GI-NEXT: ret +; +; NOSHA3-GI-LABEL: rax1: +; NOSHA3-GI: // %bb.0: +; NOSHA3-GI-NEXT: shl v2.2d, v1.2d, #1 +; NOSHA3-GI-NEXT: ushr v1.2d, v1.2d, #63 +; NOSHA3-GI-NEXT: orr v1.16b, v2.16b, v1.16b +; NOSHA3-GI-NEXT: eor v0.16b, v0.16b, v1.16b +; NOSHA3-GI-NEXT: ret %a = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %y, <2 x i64> %y, <2 x i64> <i64 1, i64 1>) %b = xor <2 x i64> %x, %a ret <2 x i64> %b } declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; NOSHA3: {{.*}} +; SHA3: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll index c680f89..5b9f188 100644 --- a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll @@ -89,9 +89,9 @@ define void @masked_ld_st_nxv8i16(ptr %in, ptr %out, i64 %n) { ; IR-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 [[TMP0]] ; IR-NEXT: [[TMP1:%.*]] = shl i64 [[INDVAR]], 1 ; IR-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[OUT:%.*]], i64 [[TMP1]] -; IR-NEXT: [[VAL:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[UGLYGEP1]], i32 4, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> poison) +; IR-NEXT: [[VAL:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 4 [[UGLYGEP1]], <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> poison) ; IR-NEXT: [[ADDP_VEC:%.*]] = add <vscale x 8 x i16> [[VAL]], splat (i16 3) -; IR-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[ADDP_VEC]], ptr [[UGLYGEP]], i32 4, <vscale x 8 x i1> splat (i1 true)) +; IR-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[ADDP_VEC]], ptr align 4 [[UGLYGEP]], <vscale x 8 x i1> splat (i1 true)) ; IR-NEXT: [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALED_VF]] ; IR-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[N:%.*]], [[INDVAR_NEXT]] ; IR-NEXT: br i1 [[EXIT_COND]], label [[LOOP_EXIT:%.*]], label [[LOOP]] diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll index 790da46..f91be1f 100644 --- a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll +++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll @@ -115,19 +115,19 @@ define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef write ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[PTR_ADDR]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[ADD_PTR]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[ADD_PTR_1]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = 
getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[ADD_PTR_2]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4 ; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]] @@ -278,22 +278,22 @@ define void @gep_i32(i32 noundef %first, i32 noundef %N, ptr nocapture noundef w ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[PTR_ADDR]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 4 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP2]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[ADD_PTR]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP5]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[ADD_PTR_1]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 4 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP8]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr align 1 [[ADD_PTR_2]], <vscale x 16 x i1> [[PG]]) ; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 002c03aa..e86f747 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -551,7 +551,9 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_release ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 16240 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; 
GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_release @@ -562,6 +564,8 @@ define amdgpu_kernel void @workgroup_one_as_release() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_release ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 1015 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") release @@ -587,7 +591,9 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_acq_rel ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 16240 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_acq_rel @@ -599,6 +605,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_acq_rel ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 1015 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") acq_rel @@ -624,7 +632,9 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX10CU-LABEL: name: workgroup_one_as_seq_cst ; GFX10CU: bb.0.entry: + ; GFX10CU-NEXT: S_WAITCNT_soft 16240 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_one_as_seq_cst @@ -636,6 +646,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst() #0 { ; ; GFX11CU-LABEL: name: workgroup_one_as_seq_cst ; GFX11CU: bb.0.entry: + ; GFX11CU-NEXT: S_WAITCNT_soft 1015 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup-one-as") seq_cst @@ -1305,8 +1317,9 @@ define amdgpu_kernel void @workgroup_release() #0 { ; ; GFX10CU-LABEL: name: workgroup_release ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_soft 112 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_release @@ -1317,7 +1330,8 @@ define amdgpu_kernel void @workgroup_release() #0 { ; ; GFX11CU-LABEL: name: workgroup_release ; GFX11CU: bb.0.entry: - ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_soft 7 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") release @@ -1345,8 +1359,9 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; ; GFX10CU-LABEL: name: workgroup_acq_rel ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_soft 112 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_acq_rel @@ -1358,7 +1373,8 @@ define amdgpu_kernel void @workgroup_acq_rel() #0 { ; ; GFX11CU-LABEL: name: workgroup_acq_rel ; GFX11CU: bb.0.entry: - ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_soft 7 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") acq_rel @@ -1386,8 +1402,9 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; ; GFX10CU-LABEL: name: workgroup_seq_cst ; GFX10CU: bb.0.entry: - ; GFX10CU-NEXT: S_WAITCNT_soft 49279 + ; GFX10CU-NEXT: S_WAITCNT_soft 112 ; GFX10CU-NEXT: S_WAITCNT_lds_direct + ; GFX10CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; 
GFX10CU-NEXT: S_ENDPGM 0 ; ; GFX11WGP-LABEL: name: workgroup_seq_cst @@ -1399,7 +1416,8 @@ define amdgpu_kernel void @workgroup_seq_cst() #0 { ; ; GFX11CU-LABEL: name: workgroup_seq_cst ; GFX11CU: bb.0.entry: - ; GFX11CU-NEXT: S_WAITCNT_soft 64519 + ; GFX11CU-NEXT: S_WAITCNT_soft 7 + ; GFX11CU-NEXT: S_WAITCNT_VSCNT_soft undef $sgpr_null, 0 ; GFX11CU-NEXT: S_ENDPGM 0 entry: fence syncscope("workgroup") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/abs_i32.ll b/llvm/test/CodeGen/AMDGPU/abs_i32.ll new file mode 100644 index 0000000..b53047f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/abs_i32.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600 %s + +define amdgpu_kernel void @abs_v1(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v1: +; R600: ; %bb.0: +; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: MOV * T0.W, KC0[2].Z, +; R600-NEXT: SUB_INT * T1.W, 0.0, PV.W, +; R600-NEXT: MAX_INT T0.X, T0.W, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %res = call i32 @llvm.abs.i32(i32 %arg, i1 false) + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @abs_v2(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v2: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z, +; R600-NEXT: MAX_INT T0.X, KC0[2].Z, PV.W, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %neg = sub i32 0, %arg + %cond = icmp sgt i32 %arg, %neg + %res = select i1 %cond, i32 %arg, i32 %neg + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @abs_v3(ptr addrspace(1) %out, i32 %arg) { +; GFX9-LABEL: abs_v3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; R600-LABEL: abs_v3: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SUB_INT * T0.W, 0.0, KC0[2].Z, +; R600-NEXT: MAX_INT T0.X, PV.W, KC0[2].Z, +; 
R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %neg = sub i32 0, %arg + %cond = icmp sgt i32 %neg, %arg + %res = select i1 %cond, i32 %neg, i32 %arg + store i32 %res, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 53b2542..142290a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3611,10 +3611,10 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: buffer_load_ushort v6, off, s[16:19], 0 offset:4 glc -; SDAG-NEXT: s_waitcnt vmcnt(1) -; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v6 ; SDAG-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] ; SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -3627,12 +3627,12 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc -; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -3652,6 +3652,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; SDAG-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: buffer_store_short v4, off, s[16:19], 0 offset:4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3671,6 +3672,7 @@ define void @volatile_store_v6i8(<6 x i8> %data, ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GISEL-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_store_short v2, off, s[16:19], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 35d178c..6f91222 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -17562,5 +17562,1363 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } + +define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: frem_v2f64_const_zero_num: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; 
SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_v2f64_const_zero_num: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_and_b64 s[2:3], vcc, exec +; CI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] +; CI-NEXT: s_cselect_b32 s8, 0x7ff80000, 0 +; CI-NEXT: s_mov_b32 s2, s6 +; CI-NEXT: s_mov_b32 s3, s7 +; CI-NEXT: v_mov_b32_e32 v1, s8 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_v2f64_const_zero_num: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[2:3] +; VI-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-NEXT: s_cselect_b32 s0, 0x7ff80000, 0 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64_const_zero_num: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, 0, v[3:4] +; GFX9-NEXT: s_cselect_b32 s4, 0x7ff80000, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64_const_zero_num: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX10-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX10-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX10-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64_const_zero_num: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[1:4], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX11-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX11-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64_const_zero_num: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: global_load_b128 v[1:4], v0, s[2:3] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX1150-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX1150-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX1150-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 +; GFX1150-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX1150-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX1150-NEXT: v_mov_b32_e32 v3, s3 +; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64_const_zero_num: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_load_b128 v[1:4], v0, s[2:3] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2] +; GFX1200-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX1200-NEXT: v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4] +; GFX1200-NEXT: s_cselect_b32 s2, 0x7ff80000, 0 +; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0 +; GFX1200-NEXT: s_and_b32 s3, vcc_lo, exec_lo +; GFX1200-NEXT: s_cselect_b32 s3, 0x7ff80000, 0 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: v_mov_b32_e32 v3, s3 +; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm + %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 + %r1 = frem <2 x double> <double 0.0, double 0.0>, %r0 + store <2 x double> %r1, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @frem_v2f64_const_one_denum(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: frem_v2f64_const_one_denum: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB15_2 +; SI-NEXT: ; %bb.1: ; %frem.else16 +; SI-NEXT: v_and_b32_e32 
v4, 0x80000000, v1 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; SI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB15_3 +; SI-NEXT: s_branch .LBB15_8 +; SI-NEXT: .LBB15_2: +; SI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: .LBB15_3: ; %frem.compute15 +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[2:3] +; SI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; SI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v6 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s5, s3, -1 +; SI-NEXT: v_ldexp_f64 v[5:6], v[4:5], 26 +; SI-NEXT: s_cmp_lt_i32 s5, 27 +; SI-NEXT: s_cbranch_scc1 .LBB15_7 +; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; SI-NEXT: s_add_i32 s5, s3, 25 +; SI-NEXT: v_mov_b32_e32 v9, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: .LBB15_5: ; %frem.loop_body23 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_bfi_b32 v5, s4, v9, v8 +; SI-NEXT: v_add_f64 v[10:11], v[7:8], v[4:5] +; SI-NEXT: v_add_f64 v[5:6], v[10:11], -v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[7:8]|, s[2:3] +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_add_f64 v[5:6], v[7:8], -v[5:6] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[5:6] +; SI-NEXT: v_add_f64 v[10:11], v[5:6], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SI-NEXT: v_ldexp_f64 v[5:6], v[5:6], 26 +; SI-NEXT: s_sub_i32 s5, s5, 26 +; SI-NEXT: s_cmp_gt_i32 s5, 26 +; SI-NEXT: s_cbranch_scc1 .LBB15_5 +; SI-NEXT: ; %bb.6: ; %Flow50 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: .LBB15_7: ; %frem.loop_exit24 +; SI-NEXT: s_sub_i32 s2, s5, 25 +; SI-NEXT: v_ldexp_f64 v[4:5], v[5:6], s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[4:5]|, s[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v6, 0x43300000 +; SI-NEXT: v_bfi_b32 v7, s2, v6, v5 +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: v_add_f64 v[8:9], v[4:5], v[6:7] +; SI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc +; SI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; SI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; SI-NEXT: .LBB15_8: +; SI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB15_10 +; SI-NEXT: ; %bb.9: ; %frem.else +; SI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; SI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; SI-NEXT: s_mov_b64 vcc, exec +; SI-NEXT: s_cbranch_execz .LBB15_11 +; SI-NEXT: s_branch .LBB15_16 +; SI-NEXT: .LBB15_10: +; SI-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: 
.LBB15_11: ; %frem.compute +; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_and_b32_e32 v8, 0x7fffffff, v3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0x7ff00000 +; SI-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[2:3] +; SI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; SI-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc +; SI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; SI-NEXT: s_and_b64 s[2:3], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s2, v8 +; SI-NEXT: s_cselect_b32 s3, s2, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_add_i32 s5, s3, -1 +; SI-NEXT: v_ldexp_f64 v[7:8], v[6:7], 26 +; SI-NEXT: s_cmp_lt_i32 s5, 27 +; SI-NEXT: s_cbranch_scc1 .LBB15_15 +; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; SI-NEXT: s_add_i32 s5, s3, 25 +; SI-NEXT: v_mov_b32_e32 v11, 0x43300000 +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: .LBB15_13: ; %frem.loop_body +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: v_mov_b32_e32 v10, v8 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_bfi_b32 v7, s4, v11, v10 +; SI-NEXT: v_add_f64 v[12:13], v[9:10], v[6:7] +; SI-NEXT: v_add_f64 v[7:8], v[12:13], -v[6:7] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[9:10]|, s[2:3] +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SI-NEXT: v_add_f64 v[7:8], v[9:10], -v[7:8] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[7:8] +; SI-NEXT: v_add_f64 v[12:13], v[7:8], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; SI-NEXT: v_ldexp_f64 v[7:8], v[7:8], 26 +; SI-NEXT: s_sub_i32 s5, s5, 26 +; SI-NEXT: s_cmp_gt_i32 s5, 26 +; SI-NEXT: s_cbranch_scc1 .LBB15_13 +; SI-NEXT: ; %bb.14: ; %Flow +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: v_mov_b32_e32 v8, v10 +; SI-NEXT: .LBB15_15: ; %frem.loop_exit +; SI-NEXT: s_sub_i32 s2, s5, 25 +; SI-NEXT: v_ldexp_f64 v[6:7], v[7:8], s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s3, 0x432fffff +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |v[6:7]|, s[2:3] +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_mov_b32_e32 v8, 0x43300000 +; SI-NEXT: v_bfi_b32 v9, s2, v8, v7 +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_add_f64 v[10:11], v[6:7], v[8:9] +; SI-NEXT: v_add_f64 v[8:9], v[10:11], -v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v9, v9, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; SI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SI-NEXT: v_bfi_b32 v7, s2, v7, v3 +; SI-NEXT: .LBB15_16: ; %Flow49 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x7ff00000 +; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; SI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_v2f64_const_one_denum: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s2 +; CI-NEXT: s_mov_b32 s5, s3 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: 
v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB15_2 +; CI-NEXT: ; %bb.1: ; %frem.else16 +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; CI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; CI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; CI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB15_3 +; CI-NEXT: s_branch .LBB15_8 +; CI-NEXT: .LBB15_2: +; CI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CI-NEXT: .LBB15_3: ; %frem.compute15 +; CI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 +; CI-NEXT: s_cbranch_vccnz .LBB15_7 +; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; CI-NEXT: v_add_i32_e32 v8, vcc, 25, v6 +; CI-NEXT: .LBB15_5: ; %frem.loop_body23 +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; CI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; CI-NEXT: v_subrev_i32_e32 v8, vcc, 26, v8 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 +; CI-NEXT: s_cbranch_vccnz .LBB15_5 +; CI-NEXT: ; %bb.6: ; %Flow50 +; CI-NEXT: v_mov_b32_e32 v4, v6 +; CI-NEXT: v_mov_b32_e32 v5, v7 +; CI-NEXT: .LBB15_7: ; %frem.loop_exit24 +; CI-NEXT: v_subrev_i32_e32 v6, vcc, 25, v8 +; CI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; CI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; CI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; CI-NEXT: .LBB15_8: +; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; CI-NEXT: s_and_b64 vcc, exec, s[2:3] +; CI-NEXT: s_cbranch_vccz .LBB15_10 +; CI-NEXT: ; %bb.9: ; %frem.else +; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; CI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; CI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; CI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; CI-NEXT: s_cbranch_execz .LBB15_11 +; CI-NEXT: s_branch .LBB15_16 +; CI-NEXT: .LBB15_10: +; CI-NEXT: ; implicit-def: $vgpr6_vgpr7 +; CI-NEXT: .LBB15_11: ; %frem.compute +; CI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; CI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_add_i32_e32 v10, vcc, -1, v8 +; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 +; CI-NEXT: s_cbranch_vccnz .LBB15_15 +; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; CI-NEXT: v_add_i32_e32 v10, vcc, 25, v8 +; CI-NEXT: .LBB15_13: ; %frem.loop_body +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v9, v7 +; CI-NEXT: v_mov_b32_e32 v8, v6 +; CI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; CI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; CI-NEXT: v_subrev_i32_e32 v10, vcc, 26, v10 +; CI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 +; CI-NEXT: s_cbranch_vccnz .LBB15_13 +; CI-NEXT: ; %bb.14: ; %Flow +; CI-NEXT: 
v_mov_b32_e32 v6, v8 +; CI-NEXT: v_mov_b32_e32 v7, v9 +; CI-NEXT: .LBB15_15: ; %frem.loop_exit +; CI-NEXT: v_subrev_i32_e32 v8, vcc, 25, v10 +; CI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; CI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; CI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; CI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; CI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; CI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CI-NEXT: v_bfi_b32 v7, s2, v7, v3 +; CI-NEXT: .LBB15_16: ; %Flow49 +; CI-NEXT: s_mov_b32 s4, 0 +; CI-NEXT: s_mov_b32 s5, 0x7ff00000 +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[4:5] +; CI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; CI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; CI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[4:5] +; CI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_v2f64_const_one_denum: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB15_2 +; VI-NEXT: ; %bb.1: ; %frem.else16 +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; VI-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; VI-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB15_3 +; VI-NEXT: s_branch .LBB15_8 +; VI-NEXT: .LBB15_2: +; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 +; VI-NEXT: .LBB15_3: ; %frem.compute15 +; VI-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 +; VI-NEXT: s_cbranch_vccnz .LBB15_7 +; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; VI-NEXT: v_add_u32_e32 v8, vcc, 25, v6 +; VI-NEXT: .LBB15_5: ; %frem.loop_body23 +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; VI-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; VI-NEXT: v_subrev_u32_e32 v8, vcc, 26, v8 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 +; VI-NEXT: s_cbranch_vccnz .LBB15_5 +; VI-NEXT: ; %bb.6: ; %Flow50 +; VI-NEXT: v_mov_b32_e32 v4, v6 +; VI-NEXT: v_mov_b32_e32 v5, v7 +; VI-NEXT: .LBB15_7: ; %frem.loop_exit24 +; VI-NEXT: v_subrev_u32_e32 v6, vcc, 25, v8 +; VI-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; VI-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; VI-NEXT: v_bfi_b32 v5, s2, v5, v1 +; VI-NEXT: .LBB15_8: +; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_cbranch_vccz .LBB15_10 +; VI-NEXT: ; 
%bb.9: ; %frem.else +; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; VI-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; VI-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; VI-NEXT: s_cbranch_execz .LBB15_11 +; VI-NEXT: s_branch .LBB15_16 +; VI-NEXT: .LBB15_10: +; VI-NEXT: ; implicit-def: $vgpr6_vgpr7 +; VI-NEXT: .LBB15_11: ; %frem.compute +; VI-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; VI-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_add_u32_e32 v10, vcc, -1, v8 +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 +; VI-NEXT: s_cbranch_vccnz .LBB15_15 +; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; VI-NEXT: v_add_u32_e32 v10, vcc, 25, v8 +; VI-NEXT: .LBB15_13: ; %frem.loop_body +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v9, v7 +; VI-NEXT: v_mov_b32_e32 v8, v6 +; VI-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; VI-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; VI-NEXT: v_subrev_u32_e32 v10, vcc, 26, v10 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 +; VI-NEXT: s_cbranch_vccnz .LBB15_13 +; VI-NEXT: ; %bb.14: ; %Flow +; VI-NEXT: v_mov_b32_e32 v6, v8 +; VI-NEXT: v_mov_b32_e32 v7, v9 +; VI-NEXT: .LBB15_15: ; %frem.loop_exit +; VI-NEXT: v_subrev_u32_e32 v8, vcc, 25, v10 +; VI-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; VI-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; VI-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_bfi_b32 v7, s2, v7, v3 +; VI-NEXT: .LBB15_16: ; %Flow49 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s3, 0x7ff00000 +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, s[2:3] +; VI-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; VI-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; VI-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64_const_one_denum: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, 1.0 +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %frem.else16 +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, 1.0 +; GFX9-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB15_3 +; GFX9-NEXT: s_branch .LBB15_8 +; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX9-NEXT: .LBB15_3: ; %frem.compute15 +; GFX9-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX9-NEXT: v_add_u32_e32 v8, -1, v6 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v8 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX9-NEXT: ; %bb.4: ; 
%frem.loop_body23.preheader +; GFX9-NEXT: v_add_u32_e32 v8, 25, v6 +; GFX9-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX9-NEXT: v_subrev_u32_e32 v8, 26, v8 +; GFX9-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_add_f64 v[9:10], v[4:5], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v8 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 +; GFX9-NEXT: ; %bb.6: ; %Flow50 +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX9-NEXT: v_subrev_u32_e32 v6, 25, v8 +; GFX9-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_bfi_b32 v5, s2, v5, v1 +; GFX9-NEXT: .LBB15_8: +; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, 1.0 +; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3] +; GFX9-NEXT: s_cbranch_vccz .LBB15_10 +; GFX9-NEXT: ; %bb.9: ; %frem.else +; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, 1.0 +; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX9-NEXT: s_cbranch_execz .LBB15_11 +; GFX9-NEXT: s_branch .LBB15_16 +; GFX9-NEXT: .LBB15_10: +; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX9-NEXT: .LBB15_11: ; %frem.compute +; GFX9-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX9-NEXT: v_add_u32_e32 v10, -1, v8 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v10 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX9-NEXT: v_add_u32_e32 v10, 25, v8 +; GFX9-NEXT: .LBB15_13: ; %frem.loop_body +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, v6 +; GFX9-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX9-NEXT: v_subrev_u32_e32 v10, 26, v10 +; GFX9-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_add_f64 v[11:12], v[6:7], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 26, v10 +; GFX9-NEXT: s_cbranch_vccnz .LBB15_13 +; GFX9-NEXT: ; %bb.14: ; %Flow +; GFX9-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-NEXT: v_mov_b32_e32 v7, v9 +; GFX9-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX9-NEXT: v_subrev_u32_e32 v8, 25, v10 +; GFX9-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX9-NEXT: s_brev_b32 s2, -2 +; GFX9-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX9-NEXT: v_bfi_b32 v7, s2, v7, v3 +; GFX9-NEXT: .LBB15_16: ; %Flow49 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s3, 0x7ff00000 +; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[0:1]|, 
s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX9-NEXT: v_cmp_nge_f64_e64 vcc, |v[2:3]|, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64_const_one_denum: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB15_2 +; GFX10-NEXT: ; %bb.1: ; %frem.else16 +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB15_3 +; GFX10-NEXT: s_branch .LBB15_8 +; GFX10-NEXT: .LBB15_2: +; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX10-NEXT: .LBB15_3: ; %frem.compute15 +; GFX10-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX10-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX10-NEXT: v_readfirstlane_b32 s2, v6 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX10-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX10-NEXT: s_add_i32 s2, s2, 25 +; GFX10-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX10-NEXT: ; %bb.6: ; %Flow50 +; GFX10-NEXT: v_mov_b32_e32 v4, v6 +; GFX10-NEXT: v_mov_b32_e32 v8, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX10-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX10-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX10-NEXT: .LBB15_8: +; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_vccz .LBB15_10 +; GFX10-NEXT: ; %bb.9: ; %frem.else +; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX10-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX10-NEXT: s_cbranch_execz .LBB15_11 +; GFX10-NEXT: s_branch .LBB15_16 +; GFX10-NEXT: .LBB15_10: +; GFX10-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX10-NEXT: .LBB15_11: ; %frem.compute +; GFX10-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX10-NEXT: 
v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX10-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX10-NEXT: v_readfirstlane_b32 s2, v8 +; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX10-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX10-NEXT: s_add_i32 s2, s2, 25 +; GFX10-NEXT: .LBB15_13: ; %frem.loop_body +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v8, v6 +; GFX10-NEXT: s_sub_i32 s2, s2, 26 +; GFX10-NEXT: s_cmp_gt_i32 s2, 26 +; GFX10-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX10-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX10-NEXT: ; %bb.14: ; %Flow +; GFX10-NEXT: v_mov_b32_e32 v6, v8 +; GFX10-NEXT: v_mov_b32_e32 v10, s2 +; GFX10-NEXT: v_mov_b32_e32 v7, v9 +; GFX10-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX10-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX10-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX10-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX10-NEXT: .LBB15_16: ; %Flow49 +; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX10-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7ff80000, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64_const_one_denum: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %frem.else16 +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX11-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB15_3 +; GFX11-NEXT: s_branch .LBB15_8 +; GFX11-NEXT: .LBB15_2: +; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX11-NEXT: .LBB15_3: ; %frem.compute15 +; GFX11-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX11-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX11-NEXT: v_readfirstlane_b32 s2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX11-NEXT: s_add_i32 s2, s2, 25 +; 
GFX11-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX11-NEXT: ; %bb.6: ; %Flow50 +; GFX11-NEXT: v_mov_b32_e32 v4, v6 +; GFX11-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 +; GFX11-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX11-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX11-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX11-NEXT: .LBB15_8: +; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_vccz .LBB15_10 +; GFX11-NEXT: ; %bb.9: ; %frem.else +; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX11-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX11-NEXT: s_cbranch_execz .LBB15_11 +; GFX11-NEXT: s_branch .LBB15_16 +; GFX11-NEXT: .LBB15_10: +; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX11-NEXT: .LBB15_11: ; %frem.compute +; GFX11-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX11-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX11-NEXT: v_readfirstlane_b32 s2, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX11-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX11-NEXT: s_add_i32 s2, s2, 25 +; GFX11-NEXT: .LBB15_13: ; %frem.loop_body +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX11-NEXT: s_sub_i32 s2, s2, 26 +; GFX11-NEXT: s_cmp_gt_i32 s2, 26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX11-NEXT: ; %bb.14: ; %Flow +; GFX11-NEXT: v_mov_b32_e32 v6, v8 +; GFX11-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 +; GFX11-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX11-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX11-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX11-NEXT: .LBB15_16: ; %Flow49 +; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX11-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64_const_one_denum: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1150-NEXT: v_mov_b32_e32 v0, 0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX1150-NEXT: s_waitcnt vmcnt(0) +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB15_2 +; GFX1150-NEXT: ; %bb.1: ; %frem.else16 +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX1150-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB15_3 +; GFX1150-NEXT: s_branch .LBB15_8 +; GFX1150-NEXT: .LBB15_2: +; GFX1150-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1150-NEXT: .LBB15_3: ; %frem.compute15 +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1150-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX1150-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX1150-NEXT: s_add_i32 s2, s2, 25 +; GFX1150-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: 
s_cmp_gt_i32 s2, 26 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX1150-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1150-NEXT: v_add_f64 v[8:9], v[4:5], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1150-NEXT: ; %bb.6: ; %Flow50 +; GFX1150-NEXT: v_mov_b32_e32 v4, v6 +; GFX1150-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 +; GFX1150-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX1150-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX1150-NEXT: v_add_f64 v[4:5], v[4:5], -v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1150-NEXT: v_add_f64 v[6:7], v[4:5], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX1150-NEXT: .LBB15_8: +; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1150-NEXT: s_cbranch_vccz .LBB15_10 +; GFX1150-NEXT: ; %bb.9: ; %frem.else +; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX1150-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1150-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX1150-NEXT: s_cbranch_execz .LBB15_11 +; GFX1150-NEXT: s_branch .LBB15_16 +; GFX1150-NEXT: .LBB15_10: +; GFX1150-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1150-NEXT: .LBB15_11: ; %frem.compute +; GFX1150-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1150-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX1150-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX1150-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX1150-NEXT: s_add_i32 s2, s2, 25 +; GFX1150-NEXT: .LBB15_13: ; %frem.loop_body +; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1150-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1150-NEXT: s_sub_i32 s2, s2, 26 +; GFX1150-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX1150-NEXT: v_add_f64 v[6:7], v[8:9], -v[6:7] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: v_add_f64 v[10:11], v[6:7], 1.0 +; GFX1150-NEXT: 
v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1150-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX1150-NEXT: ; %bb.14: ; %Flow +; GFX1150-NEXT: v_mov_b32_e32 v6, v8 +; GFX1150-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 +; GFX1150-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX1150-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX1150-NEXT: v_add_f64 v[6:7], v[6:7], -v[8:9] +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1150-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1150-NEXT: v_add_f64 v[8:9], v[6:7], 1.0 +; GFX1150-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX1150-NEXT: .LBB15_16: ; %Flow49 +; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX1150-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX1150-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX1150-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX1150-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 +; GFX1150-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX1150-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64_const_one_denum: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1200-NEXT: v_mov_b32_e32 v0, 0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX1200-NEXT: s_wait_loadcnt 0x0 +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, 1.0 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_cbranch_vccz .LBB15_2 +; GFX1200-NEXT: ; %bb.1: ; %frem.else16 +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, 1.0 +; GFX1200-NEXT: v_and_b32_e32 v4, 0x80000000, v1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v5, v1, v4, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB15_3 +; GFX1200-NEXT: s_branch .LBB15_8 +; GFX1200-NEXT: .LBB15_2: +; GFX1200-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1200-NEXT: .LBB15_3: ; %frem.compute15 +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[4:5], |v[0:1]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v6, v[0:1] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1200-NEXT: v_add_nc_u32_e32 v8, -1, v6 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v8 +; GFX1200-NEXT: s_cbranch_vccnz .LBB15_7 +; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader +; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 +; GFX1200-NEXT: .LBB15_5: ; %frem.loop_body23 +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1200-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f64_e32 v[4:5], v[6:7] +; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[6:7], -v[4:5] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[4:5] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_cndmask_b32 v4, v4, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1200-NEXT: ; %bb.6: ; %Flow50 +; GFX1200-NEXT: v_mov_b32_e32 v4, v6 +; GFX1200-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v5, v7 +; GFX1200-NEXT: .LBB15_7: ; %frem.loop_exit24 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v6, 25, v8 +; GFX1200-NEXT: v_ldexp_f64 v[4:5], v[4:5], v6 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[4:5] +; GFX1200-NEXT: v_add_f64_e64 v[4:5], v[4:5], -v[6:7] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[4:5] +; GFX1200-NEXT: v_add_f64_e32 v[6:7], 1.0, v[4:5] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v1 +; GFX1200-NEXT: .LBB15_8: +; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, 1.0 +; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cbranch_vccz .LBB15_10 +; GFX1200-NEXT: ; %bb.9: ; %frem.else +; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, 1.0 +; GFX1200-NEXT: v_and_b32_e32 v6, 0x80000000, v3 +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc_lo +; GFX1200-NEXT: s_cbranch_execz .LBB15_11 +; GFX1200-NEXT: s_branch .LBB15_16 +; GFX1200-NEXT: .LBB15_10: +; GFX1200-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1200-NEXT: .LBB15_11: ; %frem.compute +; GFX1200-NEXT: v_frexp_mant_f64_e64 v[6:7], |v[2:3]| +; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v8, v[2:3] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1200-NEXT: v_add_nc_u32_e32 v10, -1, v8 +; GFX1200-NEXT: v_readfirstlane_b32 s2, v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v10 +; GFX1200-NEXT: s_cbranch_vccnz .LBB15_15 +; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader +; GFX1200-NEXT: s_add_co_i32 s2, s2, 25 +; GFX1200-NEXT: .LBB15_13: ; %frem.loop_body +; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_sub_co_i32 s2, s2, 26 +; GFX1200-NEXT: s_wait_alu 0xfffe +; GFX1200-NEXT: s_cmp_gt_i32 s2, 26 +; GFX1200-NEXT: v_rndne_f64_e32 v[6:7], v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[8:9], -v[6:7] +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: v_add_f64_e32 v[10:11], 1.0, v[6:7] +; 
GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_dual_cndmask_b32 v7, v7, v11 :: v_dual_cndmask_b32 v6, v6, v10 +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], 26 +; GFX1200-NEXT: s_cbranch_scc1 .LBB15_13 +; GFX1200-NEXT: ; %bb.14: ; %Flow +; GFX1200-NEXT: v_mov_b32_e32 v6, v8 +; GFX1200-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v7, v9 +; GFX1200-NEXT: .LBB15_15: ; %frem.loop_exit +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_subrev_nc_u32_e32 v8, 25, v10 +; GFX1200-NEXT: v_ldexp_f64 v[6:7], v[6:7], v8 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_rndne_f64_e32 v[8:9], v[6:7] +; GFX1200-NEXT: v_add_f64_e64 v[6:7], v[6:7], -v[8:9] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-NEXT: v_cmp_gt_f64_e32 vcc_lo, 0, v[6:7] +; GFX1200-NEXT: v_add_f64_e32 v[8:9], 1.0, v[6:7] +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_cndmask_b32 v6, v6, v8 :: v_dual_cndmask_b32 v7, v7, v9 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v3 +; GFX1200-NEXT: .LBB15_16: ; %Flow49 +; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_cndmask_b32_e32 v1, 0x7ff80000, v5, vcc_lo +; GFX1200-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc_lo +; GFX1200-NEXT: v_cmp_nle_f64_e64 vcc_lo, 0x7ff00000, |v[2:3]| +; GFX1200-NEXT: s_wait_alu 0xfffd +; GFX1200-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_cndmask_b32 v3, 0x7ff80000, v7 +; GFX1200-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc_lo +; GFX1200-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm + %r0 = load <2 x double>, ptr addrspace(1) %in, align 16 + %r1 = frem <2 x double> %r0, <double 1.0, double 1.0> + store <2 x double> %r1, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_kernel void @frem_v2f64_const(ptr addrspace(1) %out) #0 { +; SI-LABEL: frem_v2f64_const: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; CI-LABEL: frem_v2f64_const: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: v_mov_b32_e32 v3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: frem_v2f64_const: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: v_mov_b32_e32 v3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: frem_v2f64_const: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: frem_v2f64_const: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v0, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64_const: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +; +; GFX1150-LABEL: frem_v2f64_const: +; GFX1150: ; %bb.0: +; GFX1150-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1150-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1150-NEXT: v_mov_b32_e32 v2, v0 +; GFX1150-NEXT: v_mov_b32_e32 v3, v0 +; GFX1150-NEXT: s_waitcnt lgkmcnt(0) +; GFX1150-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1150-NEXT: s_endpgm +; +; GFX1200-LABEL: frem_v2f64_const: +; GFX1200: ; %bb.0: +; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1200-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0 +; GFX1200-NEXT: s_wait_kmcnt 0x0 +; GFX1200-NEXT: global_store_b128 v0, v[0:3], s[0:1] +; GFX1200-NEXT: s_endpgm + %r0 = frem <2 x double> <double 1.0, double 1.0>, <double 2.0, double 1.0> + store <2 x double> %r0, ptr addrspace(1) %out, align 16 + ret void +} + + + attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } + + diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll index b91963f..d23509b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -150,7 +150,6 @@ define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, ; GFX10CU-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds ; GFX10CU-NEXT: v_mov_b32_e32 v0, s13 ; GFX10CU-NEXT: s_waitcnt vmcnt(0) -; GFX10CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10CU-NEXT: s_barrier ; GFX10CU-NEXT: ds_read_b32 v0, v0 ; GFX10CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll index 5d03dfb..352af04 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.to.lds.ll @@ -218,3 +218,174 @@ main_body: ret void } +define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s0 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:256 lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: 
global_load_lds_dword_volatile: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:256 +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 m0, s0 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:256 lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: s_mov_b32 m0, s0 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:256 +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dword_volatile(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: buffer_load_lds_dword_volatile: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: buffer_load_lds_dword_volatile: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_volatile: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: buffer_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:256 lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +main_body: + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void 
@llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 256, i32 0) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v0, s[0:1] glc slc lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v0, s[0:1] nt +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v0, s[0:1] slc lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: global_load_lds_dword v0, s[0:1] nt +; GFX942-GISEL-NEXT: s_endpgm +main_body: + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + ret void +} + +define amdgpu_ps void @buffer_load_lds_dword_nontemporal(ptr addrspace(7) nocapture inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: buffer_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: buffer_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: ; %main_body +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: s_endpgm +; +; GFX10-LABEL: buffer_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: buffer_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: ; %main_body +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: s_endpgm +main_body: + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll index f0204bd..1dcd032 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.ll @@ -110,3 +110,43 @@ main_body: call void 
@llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 1, i32 0, i32 0, i32 2048, i32 0) ret void } + +define amdgpu_ps float @buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GCN-LABEL: buffer_load_lds_dword_volatile: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc lds +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:256 lds +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 offset:512 lds +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 2147483648) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 256, i32 0) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 512, i32 0) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps float @buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GCN-LABEL: buffer_load_lds_dword_nontemporal: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 m0, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: buffer_load_dword off, s[0:3], 0 glc slc lds +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ds_read_b32 v0, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0), !nontemporal !0 + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll index b5d741b..a979999 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.ll @@ -80,25 +80,29 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(p ; PREGFX10-LABEL: buffer_load_volatile: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc +; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc ; PREGFX10-NEXT: s_waitcnt vmcnt(0) ; PREGFX10-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: buffer_load_volatile: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: buffer_load_volatile: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[8:11], off, 
s[0:3], 0 glc slc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll index e8b8d05..e8eccb0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -147,14 +147,13 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX8-OPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-OPT-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-OPT-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-OPT-NEXT: s_barrier -; GFX8-OPT-NEXT: v_add_u32_e32 v1, vcc, v1, v1 -; GFX8-OPT-NEXT: s_nop 1 -; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v1 -; GFX8-OPT-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-OPT-NEXT: v_add_u32_e32 v4, vcc, v1, v1 +; GFX8-OPT-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-OPT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-OPT-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX8-OPT-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-OPT-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-OPT-NEXT: s_barrier ; GFX8-OPT-NEXT: flat_store_dword v[0:1], v2 ; GFX8-OPT-NEXT: s_endpgm ; @@ -194,14 +193,14 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ds_read_b32 v1, v0 -; GFX10-NEXT: s_barrier -; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v1 -; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v1, v1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: s_barrier +; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm ; @@ -213,15 +212,15 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX11-NEXT: ds_load_b32 v1, v0 -; GFX11-NEXT: s_barrier -; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf -; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v1, v1 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf +; GFX11-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 97db15b..1d1d3e4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -107,6 +107,7 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -120,7 +121,6 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen ; GFX12-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 9dac239..1e4b633 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -354,6 +354,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s10 @@ -365,8 +366,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-SDAG-NEXT: s_mov_b32 s3, s10 ; GFX9-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -384,6 +385,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 ; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 ; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 @@ -395,8 +397,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 ; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX942-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -414,6 +416,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_or_b64 s[8:9], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-SDAG-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_load_dword s13, s[4:5], 0x30 ; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-SDAG-NEXT: s_mov_b32 s5, s12 @@ -425,8 +428,8 @@ define amdgpu_kernel void 
@buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX942-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX942-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -444,6 +447,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX942-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX942-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 ; GFX942-GISEL-NEXT: s_mov_b32 s4, s7 @@ -455,8 +459,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX942-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX942-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX942-GISEL-NEXT: s_endpgm ; ; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -475,6 +479,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] ; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 @@ -488,8 +493,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_mov_b32 s2, s1 ; GFX10-SDAG-NEXT: s_mov_b32 s3, s10 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[10:11] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -508,6 +513,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] ; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: s_clause 0x1 ; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 @@ -519,8 +525,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] ; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 ; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -541,6 +547,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 
; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -554,8 +561,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -576,6 +583,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 @@ -588,8 +596,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -610,6 +618,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 @@ -623,8 +632,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: @@ -645,6 +654,7 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] ; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 @@ -657,8 +667,8 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0 diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll index 516c3946..282a7ae 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll @@ -15,7 +15,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX10-CU-LABEL: test_s_barrier: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -26,7 +25,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX11-CU-LABEL: test_s_barrier: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -38,7 +36,6 @@ define amdgpu_kernel void @test_s_barrier() { ; ; GFX12-CU-LABEL: test_s_barrier: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_alu 0xffe3 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm @@ -63,8 +60,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX10-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -77,8 +74,8 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX11-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -94,8 +91,10 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() { ; ; GFX12-CU-LABEL: test_s_barrier_workgroup_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xffe3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm @@ -125,7 +124,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-CU-NEXT: s_barrier ; GFX10-CU-NEXT: s_endpgm ; @@ -140,7 +138,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-CU-NEXT: s_waitcnt_depctr 0xffe3 ; GFX11-CU-NEXT: s_barrier ; GFX11-CU-NEXT: s_endpgm ; @@ -160,7 +157,6 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() { ; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-CU-NEXT: s_wait_alu 0xffe3 ; GFX12-CU-NEXT: s_barrier_signal -1 ; GFX12-CU-NEXT: s_barrier_wait -1 ; GFX12-CU-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 6a76f43..7efbff9 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -107,6 +107,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; 
; GFX10-CU-LABEL: workgroup_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: @@ -139,6 +141,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX11-CU-LABEL: workgroup_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_release_fence: @@ -151,6 +155,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_release_fence: @@ -181,6 +189,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: @@ -216,6 +226,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: @@ -229,6 +241,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_acq_rel_fence: @@ -259,6 +275,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: @@ -294,6 +312,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: @@ -307,6 +327,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_seq_cst_fence: @@ -412,6 +436,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: @@ -444,6 +470,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -456,6 +484,10 @@ define 
amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_release_fence: @@ -486,6 +518,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: @@ -521,6 +555,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -534,6 +570,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: @@ -564,6 +604,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: @@ -599,6 +641,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -612,6 +656,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index d288bfc..1cca64a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1093,7 +1093,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX10-CU-LABEL: workgroup_release_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_release_fence: @@ -1129,7 +1130,8 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX11-CU-LABEL: workgroup_release_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_release_fence: @@ -1142,7 +1144,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-CU-LABEL: workgroup_release_fence: ; 
GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_release_fence: @@ -1175,7 +1180,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_acq_rel_fence: @@ -1214,7 +1220,8 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: @@ -1228,7 +1235,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_acq_rel_fence: @@ -1261,7 +1271,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_seq_cst_fence: @@ -1300,7 +1311,8 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: @@ -1314,7 +1326,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_seq_cst_fence: @@ -1420,6 +1435,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_release_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: @@ -1452,6 +1469,8 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_release_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_release_fence: @@ -1464,6 +1483,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; 
GFX1250-LABEL: workgroup_one_as_release_fence: @@ -1494,6 +1517,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: @@ -1529,6 +1554,8 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_acq_rel_fence: @@ -1542,6 +1569,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: @@ -1572,6 +1603,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: @@ -1607,6 +1640,8 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: workgroup_one_as_seq_cst_fence: @@ -1620,6 +1655,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 4caaad6..9e2906c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -795,8 +795,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -816,8 +814,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2942,8 +2938,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; 
GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2964,8 +2958,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3196,8 +3188,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3218,8 +3208,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9001,8 +8989,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9027,8 +9013,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9349,8 +9333,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9375,8 +9357,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9677,8 +9657,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9699,8 +9677,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10335,8 +10311,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10361,8 +10335,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10683,8 +10655,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10709,8 +10679,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11031,8 +10999,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11057,8 +11023,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 
-; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11379,8 +11343,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11405,8 +11367,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12071,8 +12031,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12097,8 +12055,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12419,8 +12375,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12445,8 +12399,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12767,8 +12719,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12793,8 +12743,6 @@ define 
amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13632,8 +13580,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13654,8 +13600,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15789,8 +15733,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15812,8 +15754,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16054,8 +15994,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16077,8 +16015,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -21829,8 +21765,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ 
-21856,8 +21790,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22188,8 +22120,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22215,8 +22145,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22527,8 +22455,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22550,8 +22476,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23207,8 +23131,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23234,8 +23156,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23566,8 +23486,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23593,8 +23511,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23925,8 +23841,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23952,8 +23866,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24284,8 +24196,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24311,8 +24221,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24998,8 +24906,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25025,8 +24931,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s0 @@ -25357,8 +25261,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25384,8 +25286,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25716,8 +25616,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25743,8 +25641,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll index 9ea9f11..27283be 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll @@ -795,8 +795,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -816,8 +814,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2937,8 +2933,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2959,8 +2953,6 @@ define amdgpu_kernel void 
@flat_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3190,8 +3182,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3212,8 +3202,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8982,8 +8970,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9008,8 +8994,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9329,8 +9313,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9355,8 +9337,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9656,8 +9636,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9678,8 +9656,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10314,8 +10290,6 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10340,8 +10314,6 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10661,8 +10633,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10687,8 +10657,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11008,8 +10976,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11034,8 +11000,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11355,8 +11319,6 @@ 
define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11381,8 +11343,6 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12045,8 +12005,6 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12071,8 +12029,6 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12392,8 +12348,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12418,8 +12372,6 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12739,8 +12691,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12765,8 +12715,6 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13603,8 +13551,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13625,8 +13571,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -15755,8 +15699,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -15778,8 +15720,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16019,8 +15959,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16042,8 +15980,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -21781,8 +21717,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21808,8 +21742,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22139,8 +22071,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22166,8 +22096,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22477,8 +22405,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22500,8 +22426,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23157,8 +23081,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23184,8 +23106,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23515,8 +23435,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: 
s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23542,8 +23460,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23873,8 +23789,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23900,8 +23814,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24231,8 +24143,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24258,8 +24168,6 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24943,8 +24851,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24970,8 +24876,6 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25301,8 +25205,6 @@ define amdgpu_kernel void 
@flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25328,8 +25230,6 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25659,8 +25559,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25686,8 +25584,6 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 265c8c4..74065146 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -799,8 +799,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -820,8 +818,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2979,8 +2975,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3002,8 +2996,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; 
GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3239,8 +3231,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3262,8 +3252,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9126,8 +9114,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9153,8 +9139,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9480,8 +9464,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9507,8 +9489,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9811,8 +9791,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9833,8 +9811,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10476,8 +10452,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10503,8 +10477,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10830,8 +10802,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10857,8 +10827,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11184,8 +11152,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11211,8 +11177,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -11538,8 +11502,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: 
s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11565,8 +11527,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12242,8 +12202,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12269,8 +12227,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12596,8 +12552,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12623,8 +12577,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -12950,8 +12902,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -12977,8 +12927,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: 
s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -13820,8 +13768,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13842,8 +13788,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16010,8 +15954,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16034,8 +15976,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -16281,8 +16221,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -16305,8 +16243,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22138,8 +22074,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22166,8 +22100,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22503,8 +22435,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22531,8 +22461,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -22845,8 +22773,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22868,8 +22794,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23532,8 +23456,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -23560,8 +23482,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -23897,8 +23817,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: 
v_mov_b32_e32 v0, s0 @@ -23925,8 +23843,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24262,8 +24178,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24290,8 +24204,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -24627,8 +24539,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -24655,8 +24565,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25353,8 +25261,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25381,8 +25287,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -25718,8 +25622,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -25746,8 +25648,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -26083,8 +25983,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -26111,8 +26009,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index d277441..2afa577 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -1072,7 +1072,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1109,7 +1110,8 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1136,7 +1138,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index e7f348e..d384aec 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -656,12 +656,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -765,12 +765,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -786,8 +786,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -802,12 +800,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -1195,7 +1195,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1280,7 +1281,8 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1307,7 +1309,10 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -1374,7 +1379,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1459,7 +1465,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -1486,7 +1493,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -1893,7 +1903,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1978,7 +1989,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -2005,7 +2017,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -2076,9 +2091,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2172,9 +2189,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2202,9 +2221,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: @@ -2275,9 +2297,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2371,9 +2395,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2401,9 +2427,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: @@ -2699,12 +2728,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2815,12 +2844,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc -; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -2837,8 +2866,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2854,12 +2881,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -2939,12 +2968,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3055,12 +3084,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -3077,8 +3106,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3094,12 +3121,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -3737,7 +3766,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -3860,7 +3890,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3895,7 +3926,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -4013,9 +4047,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -4147,9 +4183,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -4185,9 +4223,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: @@ -4305,9 +4346,11 @@ define amdgpu_kernel void 
@flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4439,9 +4482,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -4477,9 +4522,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: @@ -5143,9 +5191,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5277,9 +5327,11 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5315,9 +5367,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: @@ -5435,9 +5490,11 @@ define amdgpu_kernel void 
@flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5569,9 +5626,11 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5607,9 +5666,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: @@ -5727,9 +5789,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5861,9 +5925,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -5899,9 +5965,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: @@ -6019,9 +6088,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; 
GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6153,9 +6224,11 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6191,9 +6264,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6929,7 +7005,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -7076,7 +7153,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -7119,7 +7197,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -7251,12 +7332,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -7405,12 +7486,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -7431,8 +7512,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7452,12 +7531,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -7585,12 +7666,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -7739,12 +7820,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -7765,8 +7846,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7786,12 +7865,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -8083,8 +8164,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8547,12 +8626,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -8701,12 +8780,12 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -8727,8 +8806,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8748,12 +8825,14 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -8881,12 +8960,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -9035,12 +9114,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -9061,8 +9140,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9082,12 +9159,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -9215,12 +9294,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 
v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -9369,12 +9448,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -9395,8 +9474,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9416,12 +9493,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -9549,7 +9628,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9703,7 +9783,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9729,8 +9810,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: 
s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -9750,7 +9829,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9883,7 +9965,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -10037,7 +10120,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10082,7 +10166,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -10215,12 +10302,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -10369,12 +10456,12 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc 
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -10395,8 +10482,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10416,12 +10501,14 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -10549,12 +10636,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -10703,12 +10790,12 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -10729,8 +10816,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -10750,12 +10835,14 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -10883,12 +10970,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11037,12 +11124,12 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11063,8 +11150,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11084,12 +11169,14 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -11758,10 +11845,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -11860,10 +11950,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -11879,8 +11972,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -11896,10 +11987,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -12286,6 +12382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12367,6 +12465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12393,6 +12493,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -12458,6 +12562,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -12539,6 +12645,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -12565,6 +12673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -12961,6 +13073,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -13042,6 +13156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -13068,6 +13184,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -13135,7 +13255,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -13222,7 +13345,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -13250,7 +13376,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: @@ -13318,7 +13449,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -13405,7 +13539,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -13433,7 +13570,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: @@ -13724,10 +13866,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -13833,10 +13978,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -13853,8 +14001,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -13871,10 +14017,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: 
s_endpgm ; @@ -13953,10 +14104,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -14062,10 +14216,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -14082,8 +14239,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -14100,10 +14255,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -14731,6 +14891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -14850,6 +15012,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -14884,6 +15048,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -14998,7 +15166,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15123,7 +15294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15159,7 +15333,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15274,7 +15453,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15399,7 +15581,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15435,7 +15620,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -16078,7 +16268,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; 
GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -16203,7 +16396,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -16239,7 +16435,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: @@ -16354,7 +16555,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16479,7 +16683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16515,7 +16722,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16630,7 +16842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16755,7 +16970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16791,7 +17009,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16906,6 +17129,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -17031,6 +17256,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17067,6 +17294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17182,6 +17413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm ; @@ -17307,6 +17540,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17343,6 +17578,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17458,7 +17697,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17583,7 +17825,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17619,7 +17864,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17734,7 +17984,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17859,7 +18112,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17895,7 +18151,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -18010,7 +18271,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18135,7 +18399,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18171,7 +18438,12 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18902,6 +19174,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -19045,6 +19319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -19087,6 +19363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -19217,10 +19497,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -19364,10 +19647,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -19388,8 +19674,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -19410,10 +19694,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -19540,10 +19829,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -19687,10 +19979,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -19711,8 +20006,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -19733,10 +20026,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: 
s_endpgm ; @@ -20024,8 +20322,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -20483,10 +20779,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -20630,10 +20929,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -20654,8 +20956,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -20676,10 +20976,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -20806,10 +21111,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: 
flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -20953,10 +21261,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -20977,8 +21288,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -20999,10 +21308,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -21129,10 +21443,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -21276,10 +21593,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -21300,8 +21620,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21322,10 +21640,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -21452,6 +21775,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -21599,6 +21924,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -21623,8 +21950,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -21645,6 +21970,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -21775,6 +22104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 @@ -21922,6 +22253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -21966,6 +22299,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 @@ -22096,10 +22433,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -22243,10 +22583,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -22267,8 +22610,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22289,10 +22630,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -22419,10 +22765,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -22566,10 +22915,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -22590,8 +22942,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22612,10 +22962,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; @@ -22742,10 +23097,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -22889,10 +23247,13 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_endpgm ; @@ -22913,8 +23274,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN 
scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -22935,10 +23294,15 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 5c2d8eb..493f985 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -804,8 +804,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -822,8 +820,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2984,8 +2980,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3003,8 +2997,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3229,8 +3221,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3248,8 +3238,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( 
; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8606,8 +8594,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8629,8 +8615,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8923,8 +8907,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8946,8 +8928,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9219,8 +9199,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9238,8 +9216,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9814,8 +9790,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9837,8 +9811,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10131,8 +10103,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10154,8 +10124,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10448,8 +10416,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10471,8 +10437,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10765,8 +10729,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10788,8 +10750,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11395,8 +11355,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11418,8 +11376,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11712,8 +11668,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11735,8 +11689,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12029,8 +11981,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12052,8 +12002,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12878,8 +12826,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12896,8 +12842,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15058,8 +15002,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15077,8 +15019,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15303,8 +15243,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15322,8 +15260,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20384,8 +20320,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20407,8 +20341,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20701,8 +20633,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; 
GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20724,8 +20654,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20997,8 +20925,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21016,8 +20942,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21592,8 +21516,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21615,8 +21537,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21909,8 +21829,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21932,8 +21850,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; 
GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22226,8 +22142,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22249,8 +22163,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22543,8 +22455,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22566,8 +22476,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23173,8 +23081,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23196,8 +23102,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23490,8 +23394,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23513,8 +23415,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23807,8 +23707,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23830,8 +23728,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll index 7c0a2ad..40257df 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll @@ -804,8 +804,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -822,8 +820,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2979,8 +2975,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2998,8 +2992,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3223,8 +3215,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3242,8 +3232,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8587,8 +8575,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8610,8 +8596,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8903,8 +8887,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8926,8 +8908,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9198,8 +9178,6 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 
0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9217,8 +9195,6 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9793,8 +9769,6 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9816,8 +9790,6 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10109,8 +10081,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10132,8 +10102,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10425,8 +10393,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10448,8 +10414,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10741,8 +10705,6 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10764,8 +10726,6 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11369,8 +11329,6 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11392,8 +11350,6 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11685,8 +11641,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11708,8 +11662,6 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12001,8 +11953,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12024,8 
+11974,6 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12849,8 +12797,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12867,8 +12813,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15024,8 +14968,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15043,8 +14985,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15268,8 +15208,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -15287,8 +15225,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20337,8 +20273,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 
-; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20360,8 +20294,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20653,8 +20585,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20676,8 +20606,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20948,8 +20876,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20967,8 +20893,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21543,8 +21467,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21566,8 +21488,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: 
s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21859,8 +21779,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21882,8 +21800,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22175,8 +22091,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22198,8 +22112,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22491,8 +22403,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22514,8 +22424,6 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23119,8 +23027,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23142,8 +23048,6 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23435,8 +23339,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23458,8 +23360,6 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23751,8 +23651,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_DEV ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -23774,8 +23672,6 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index e7880a8..ee5a8bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -808,8 +808,6 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -826,8 +824,6 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 
0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3021,8 +3017,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3041,8 +3035,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3272,8 +3264,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3292,8 +3282,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7235,8 +7223,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7259,8 +7245,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7558,8 +7542,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7582,8 +7564,6 @@ 
define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7857,8 +7837,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7876,8 +7854,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8459,8 +8435,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8483,8 +8457,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8782,8 +8754,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8806,8 +8776,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9105,8 +9073,6 @@ define amdgpu_kernel void 
@global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9129,8 +9095,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9428,8 +9392,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9452,8 +9414,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10070,8 +10030,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10094,8 +10052,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10393,8 +10349,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10417,8 +10371,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10716,8 +10668,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10740,8 +10690,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11570,8 +11518,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11588,8 +11534,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13783,8 +13727,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -13803,8 +13745,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14034,8 +13974,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; 
GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14054,8 +13992,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19493,8 +19429,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19517,8 +19451,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19816,8 +19748,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19840,8 +19770,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20115,8 +20043,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20134,8 +20060,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] @@ -20717,8 +20641,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20741,8 +20663,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21040,8 +20960,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21064,8 +20982,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21363,8 +21279,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21387,8 +21301,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21686,8 +21598,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21710,8 +21620,6 @@ define amdgpu_kernel void 
@global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22328,8 +22236,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22352,8 +22258,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22651,8 +22555,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22675,8 +22577,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22974,8 +22874,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22998,8 +22896,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 
3bf5ed8..c326edf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -959,7 +959,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1001,7 +1002,8 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1026,7 +1028,10 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 885edec..868b438 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -667,7 +667,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -763,7 +764,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -780,8 +782,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -792,7 +792,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -1206,7 +1209,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1292,7 +1296,8 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1317,7 +1322,10 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -1393,7 +1401,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -1479,7 +1488,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -1504,7 +1514,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -1920,7 +1933,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -2005,7 +2019,8 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -2030,7 +2045,10 @@ define amdgpu_kernel void 
@global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -2107,8 +2125,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2198,8 +2218,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2225,8 +2247,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: @@ -2303,8 +2329,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2394,8 +2422,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2421,8 +2451,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: @@ -2707,7 +2741,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -2809,7 +2844,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2827,8 +2863,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2841,7 +2875,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2930,7 +2967,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -3032,7 +3070,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3050,8 +3089,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, 
s[0:1] @@ -3064,7 +3101,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -3650,7 +3690,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3764,7 +3805,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3797,7 +3839,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3906,8 +3951,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -4026,8 +4073,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -4061,8 +4110,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: 
s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: @@ -4171,8 +4224,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4291,8 +4346,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4326,8 +4383,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: @@ -4926,8 +4987,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -5046,8 +5109,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -5081,8 +5146,12 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: 
v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: @@ -5191,8 +5260,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5311,8 +5382,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5346,8 +5419,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: @@ -5456,8 +5533,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5576,8 +5655,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5611,8 +5692,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 
killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: @@ -5721,7 +5806,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -5841,7 +5927,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -5876,7 +5963,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -5986,7 +6076,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -6106,7 +6197,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -6141,7 +6233,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -6251,8 +6346,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; 
GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6371,8 +6468,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6406,8 +6505,12 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: @@ -6516,8 +6619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6636,8 +6741,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6671,8 +6778,12 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: @@ -6781,8 +6892,10 @@ define 
amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6901,8 +7014,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -6936,8 +7051,12 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: @@ -7594,7 +7713,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -7723,7 +7843,8 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7760,7 +7881,10 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7883,7 +8007,8 @@ define amdgpu_kernel void 
@global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8015,7 +8140,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8037,8 +8163,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8055,7 +8179,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8178,7 +8305,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -8310,7 +8438,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8332,8 +8461,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN 
scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8350,7 +8477,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8610,8 +8740,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9021,7 +9149,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9153,7 +9282,8 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9175,8 +9305,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9193,7 +9321,10 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; 
GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9316,7 +9447,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9448,7 +9580,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9470,8 +9603,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9488,7 +9619,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9611,7 +9745,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -9743,7 +9878,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9765,8 +9901,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: 
global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9783,7 +9917,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9906,7 +10043,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10038,7 +10176,8 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10060,8 +10199,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10078,7 +10215,10 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10201,7 +10341,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10333,7 +10474,8 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10371,7 +10513,10 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10494,7 +10639,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10626,7 +10772,8 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10648,8 +10795,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10666,7 +10811,10 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10789,7 
+10937,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -10921,7 +11070,8 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10943,8 +11093,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -10961,7 +11109,10 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11084,7 +11235,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -11216,7 +11368,8 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11238,8 +11391,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11256,7 +11407,10 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -11940,7 +12094,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -12035,7 +12190,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12052,8 +12208,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -12064,6 +12218,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 @@ -12475,6 +12633,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -12557,6 +12717,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12581,6 +12743,10 @@ define 
amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -12654,6 +12820,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -12736,6 +12904,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -12760,6 +12930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -13173,6 +13347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] ; GFX10-CU-NEXT: s_endpgm ; @@ -13254,6 +13430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_endpgm ; @@ -13278,6 +13456,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm ; @@ -13352,7 +13534,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -13439,7 +13624,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -13465,7 +13653,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: @@ -13540,7 +13733,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -13627,7 +13823,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -13653,7 +13852,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: @@ -13936,6 +14140,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -14034,6 +14240,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14051,8 +14259,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; 
GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14065,6 +14271,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14151,6 +14361,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -14249,6 +14461,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14266,8 +14480,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14280,6 +14492,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -14863,6 +15079,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -14973,6 +15191,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -15005,6 +15225,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def 
$vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -15111,7 +15335,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15227,7 +15454,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15261,7 +15491,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -15368,7 +15603,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15484,7 +15722,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -15518,7 +15759,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 
0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -16115,7 +16361,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -16231,7 +16480,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -16265,7 +16517,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: @@ -16372,7 +16629,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16488,7 +16748,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16522,7 +16785,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; 
GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -16629,7 +16897,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16745,7 +17016,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16779,7 +17053,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -16886,6 +17165,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -17002,6 +17283,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17036,6 +17319,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17143,6 +17430,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -17259,6 +17548,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -17293,6 +17584,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -17400,7 +17695,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17516,7 +17814,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17550,7 +17851,12 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: @@ -17657,7 +17963,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17773,7 +18082,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17807,7 +18119,12 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: @@ -17914,7 +18231,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18030,7 +18350,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_endpgm ; ; GFX12-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18064,7 +18387,12 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -18719,6 +19047,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -18844,6 +19174,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -18880,6 +19212,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19000,6 +19336,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19128,6 +19466,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19149,8 +19489,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19167,6 +19505,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19287,6 +19629,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -19415,6 +19759,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19436,8 +19782,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19454,6 +19798,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -19713,8 +20061,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20122,6 +20468,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20250,6 +20598,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20271,8 +20621,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20289,6 +20637,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; 
kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20409,6 +20761,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20537,6 +20891,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20558,8 +20914,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20576,6 +20930,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20696,6 +21054,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -20824,6 +21184,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20845,8 
+21207,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20863,6 +21223,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -20983,6 +21347,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21111,6 +21477,8 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21132,8 +21500,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21150,6 +21516,10 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21270,6 +21640,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21398,6 +21770,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21435,6 +21809,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21555,6 +21933,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21683,6 +22063,8 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21704,8 +22086,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21722,6 +22102,10 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21842,6 +22226,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 
killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -21970,6 +22356,8 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -21991,8 +22379,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22009,6 +22395,10 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22129,6 +22519,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] @@ -22257,6 +22649,8 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22278,8 +22672,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE ; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -22296,6 +22688,10 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2 ; 
GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll new file mode 100644 index 0000000..0b40c81 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-lds-dma-volatile-and-nontemporal.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -global-isel=0 < %s | FileCheck --check-prefixes=GFX90A %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=0 < %s | FileCheck --check-prefixes=GFX942 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -global-isel=1 < %s | FileCheck --check-prefixes=GFX942-GISEL %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -global-isel=0 < %s | FileCheck --check-prefixes=GFX10 %s + +define amdgpu_ps void @global_load_lds_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 
%off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off slc lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @global_load_lds_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: global_load_lds_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_volatile(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: global_load_dword v[0:1], off glc slc lds +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; 
GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off nt +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off slc lds +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p1_dword_volatile_nontemporal(ptr addrspace(1) inreg %gptr, i64 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX90A-NEXT: s_mov_b32 m0, s2 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dword v[0:1], off glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_mov_b32 m0, s2 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off sc0 sc1 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: global_load_lds_dword v[0:1], off offset:512 +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p1_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: s_mov_b32 m0, s2 +; GFX10-NEXT: global_load_dword v[0:1], off glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v[0:1], off offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(1) %gptr, i64 %off + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_volatile(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: 
load_to_lds_p7_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648) + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @load_to_lds_p7_dword_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret 
void +} + +define amdgpu_ps void @load_to_lds_p7_dword_volatile_nontemporal(ptr addrspace(7) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX90A-NEXT: s_mov_b32 m0, s5 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-NEXT: s_mov_b32 m0, s5 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX942-GISEL-NEXT: s_mov_b32 m0, s5 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: load_to_lds_p7_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_mov_b32 m0, s5 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + %gptr.off = getelementptr i8, ptr addrspace(7) %gptr, i32 %off + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 0, i32 2147483648), !nontemporal !0 + call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) %gptr.off, ptr addrspace(3) %lptr, i32 4, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr 
addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc slc lds +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen nt lds +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen slc lds +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 0), !nontemporal !0 + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +define amdgpu_ps void @raw_ptr_buffer_load_lds_dword_volatile_nontemporal(ptr addrspace(8) inreg %gptr, i32 %off, ptr addrspace(3) inreg %lptr) { +; GFX90A-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 m0, s4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc lds +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX90A-NEXT: s_endpgm +; +; GFX942-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_mov_b32 m0, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-NEXT: s_endpgm +; +; GFX942-GISEL-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX942-GISEL-NEXT: s_nop 0 +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen sc0 sc1 lds +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX942-GISEL-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX942-GISEL-NEXT: s_endpgm +; +; GFX10-LABEL: raw_ptr_buffer_load_lds_dword_volatile_nontemporal: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 m0, s4 +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen glc dlc lds +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_load_dword v0, s[0:3], 0 offen offset:512 lds +; GFX10-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 0, i32 2147483648), !nontemporal !0 + 
call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %gptr, ptr addrspace(3) %lptr, i32 4, i32 %off, i32 0, i32 512, i32 0) + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 986b48b..712109d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: 
s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: 
s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void 
@local_agent_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void 
@local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, 
v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: 
v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
index 8926893..6d1e4e6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -622,7 +622,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_read_b32 v1, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
@@ -719,7 +720,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_load_b32 v1, v0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -749,7 +751,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_load_b32 v1, v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
@@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: ds_store_b32 v0, v1
; GFX11-CU-NEXT: s_endpgm
;
@@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_cluster_release_store(
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT: s_wait_dscnt 0x0
+; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT: s_wait_samplecnt 0x0
+; GFX12-CU-NEXT: s_wait_storecnt 0x0
+; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
;
@@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: ds_write_b32 v0, v1
; GFX10-CU-NEXT: s_endpgm
;
@@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt
vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void 
@local_cluster_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void 
@local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, 
v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 81bbe0a..577d2ca 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_system_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_system_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define 
amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void 
@local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, 
s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: 
s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 980141a..d686e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -819,7 +819,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -854,7 +855,8 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -879,7 +881,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 6a233a2..ab4d783 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -622,7 +622,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_read_b32 v1, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -719,7 +720,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_load_b32 v1, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -749,7 +751,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_load_b32 v1, v0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1121,7 +1126,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1200,7 +1206,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1225,7 +1232,10 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1291,7 +1301,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1370,7 +1381,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1395,7 +1407,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1778,7 +1793,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1857,7 +1873,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm ; @@ -1882,7 +1899,10 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; @@ -1952,7 +1972,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2039,7 +2060,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2067,7 +2089,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2139,7 +2164,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -2226,7 +2252,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -2254,7 +2281,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -2535,7 +2565,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2639,7 +2670,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2671,7 +2703,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2756,7 +2791,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -2860,7 +2896,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,7 +2929,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3348,7 +3388,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -3441,7 +3482,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm ; @@ -3470,7 +3512,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; @@ -3551,7 +3596,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3652,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3684,7 +3731,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -3767,7 +3817,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -3868,7 +3919,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -3900,7 +3952,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4375,7 +4430,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4476,7 +4532,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4508,7 +4565,10 @@ define amdgpu_kernel void 
@local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4591,7 +4651,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4692,7 +4753,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4724,7 +4786,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -4807,7 +4872,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -4908,7 +4974,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -4940,7 +5007,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5023,7 +5093,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5124,7 +5195,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5156,7 +5228,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5239,7 +5314,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5340,7 +5416,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5372,7 +5449,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5455,7 +5535,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5556,7 +5637,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5588,7 +5670,10 @@ define amdgpu_kernel void 
@local_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5671,7 +5756,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5772,7 +5858,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -5804,7 +5891,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -5887,7 +5977,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm @@ -5988,7 +6079,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -6020,7 +6112,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm @@ -6567,7 +6662,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6682,7 +6778,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6717,7 +6814,10 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 @@ -6814,7 +6914,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -6932,7 +7033,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6968,7 +7070,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7065,7 +7170,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7183,7 +7289,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7219,7 +7326,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7778,7 +7888,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -7896,7 +8007,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7932,7 +8044,10 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8029,7 +8144,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8147,7 +8263,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8183,7 +8300,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 
; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8280,7 +8400,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8398,7 +8519,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8434,7 +8556,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8531,7 +8656,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8649,7 +8775,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8685,7 +8812,10 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8782,7 +8912,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -8900,7 +9031,8 @@ define 
amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8936,7 +9068,10 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9033,7 +9168,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9151,7 +9287,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9187,7 +9324,10 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9284,7 +9424,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9402,7 +9543,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9438,7 +9580,10 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9535,7 +9680,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 @@ -9653,7 +9799,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9689,7 +9836,10 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX12-CU-NEXT: s_wait_dscnt 0x0 +; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 +; GFX12-CU-NEXT: s_wait_samplecnt 0x0 +; GFX12-CU-NEXT: s_wait_storecnt 0x0 +; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir new file mode 100644 index 0000000..93f7bcc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes=postmisched -o - %s | FileCheck %s + +# Ensure WMMA operations stay before the final atomic fence and barrier group. +# This allows the latency of the WMMA operations to be hidden by barrier wait. 
+--- +name: test +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 + + ; CHECK-LABEL: name: test + ; CHECK: liveins: $sgpr0, $sgpr12, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5_vgpr6_vgpr7_vgpr8, $vgpr9_vgpr10_vgpr11_vgpr12, $vgpr13_vgpr14_vgpr15_vgpr16, $vgpr17_vgpr18_vgpr19_vgpr20, $vgpr21_vgpr22_vgpr23_vgpr24, $vgpr25_vgpr26_vgpr27_vgpr28, $vgpr29_vgpr30_vgpr31_vgpr32 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ATOMIC_FENCE 5, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 4, 2 + ; CHECK-NEXT: BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def 
$vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit killed $vgpr36, implicit $exec, implicit killed $vgpr37, implicit killed $vgpr38 { + ; CHECK-NEXT: $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: 
(load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr49_vgpr50_vgpr51_vgpr52 = DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 killed $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 killed $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 killed $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3) + ; CHECK-NEXT: } + ; CHECK-NEXT: $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc + ; CHECK-NEXT: $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec + ; CHECK-NEXT: $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec + ; CHECK-NEXT: S_CMP_LT_U32 killed $sgpr0, killed $sgpr12, implicit-def $scc + ; CHECK-NEXT: $sgpr0 = S_MOV_B32 killed $sgpr1 + ; CHECK-NEXT: early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber 
$vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec + ; CHECK-NEXT: early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec + ; CHECK-NEXT: ATOMIC_FENCE 5, 2 + ; CHECK-NEXT: S_BARRIER + ; CHECK-NEXT: ATOMIC_FENCE 4, 2 + ATOMIC_FENCE 5, 2 + S_BARRIER + ATOMIC_FENCE 4, 2 + BUNDLE implicit-def $vgpr41_vgpr42_vgpr43_vgpr44, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr41_vgpr42, implicit-def $vgpr41_vgpr42_vgpr43, implicit-def $vgpr42_vgpr43, implicit-def $vgpr42_vgpr43_vgpr44, implicit-def $vgpr43_vgpr44, implicit-def $vgpr45_vgpr46_vgpr47_vgpr48, implicit-def $vgpr45, implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr45_vgpr46, implicit-def $vgpr45_vgpr46_vgpr47, implicit-def $vgpr46_vgpr47, implicit-def $vgpr46_vgpr47_vgpr48, implicit-def $vgpr47_vgpr48, implicit-def $vgpr49_vgpr50_vgpr51_vgpr52, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr49_vgpr50, implicit-def $vgpr49_vgpr50_vgpr51, implicit-def $vgpr50_vgpr51, implicit-def $vgpr50_vgpr51_vgpr52, implicit-def $vgpr51_vgpr52, implicit-def $vgpr53_vgpr54_vgpr55_vgpr56, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr53_vgpr54, implicit-def $vgpr53_vgpr54_vgpr55, implicit-def $vgpr54_vgpr55, implicit-def $vgpr54_vgpr55_vgpr56, implicit-def $vgpr55_vgpr56, implicit-def $vgpr57_vgpr58_vgpr59_vgpr60, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr57_vgpr58, implicit-def $vgpr57_vgpr58_vgpr59, implicit-def $vgpr58_vgpr59, implicit-def $vgpr58_vgpr59_vgpr60, implicit-def $vgpr59_vgpr60, implicit-def $vgpr61_vgpr62_vgpr63_vgpr64, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def 
$vgpr64_hi16, implicit-def $vgpr61_vgpr62, implicit-def $vgpr61_vgpr62_vgpr63, implicit-def $vgpr62_vgpr63, implicit-def $vgpr62_vgpr63_vgpr64, implicit-def $vgpr63_vgpr64, implicit-def $vgpr65_vgpr66_vgpr67_vgpr68, implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr65_vgpr66, implicit-def $vgpr65_vgpr66_vgpr67, implicit-def $vgpr66_vgpr67, implicit-def $vgpr66_vgpr67_vgpr68, implicit-def $vgpr67_vgpr68, implicit-def $vgpr69_vgpr70_vgpr71_vgpr72, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr69_vgpr70, implicit-def $vgpr69_vgpr70_vgpr71, implicit-def $vgpr70_vgpr71, implicit-def $vgpr70_vgpr71_vgpr72, implicit-def $vgpr71_vgpr72, implicit-def $vgpr73_vgpr74_vgpr75_vgpr76, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr73_vgpr74, implicit-def $vgpr73_vgpr74_vgpr75, implicit-def $vgpr74_vgpr75, implicit-def $vgpr74_vgpr75_vgpr76, implicit-def $vgpr75_vgpr76, implicit-def $vgpr77_vgpr78_vgpr79_vgpr80, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit-def $vgpr77_vgpr78, implicit-def $vgpr77_vgpr78_vgpr79, implicit-def $vgpr78_vgpr79, implicit-def $vgpr78_vgpr79_vgpr80, implicit-def $vgpr79_vgpr80, implicit-def $vgpr81_vgpr82_vgpr83_vgpr84, implicit-def $vgpr81, implicit-def $vgpr81_lo16, implicit-def $vgpr81_hi16, implicit-def $vgpr82, implicit-def $vgpr82_lo16, implicit-def $vgpr82_hi16, implicit-def $vgpr83, implicit-def $vgpr83_lo16, implicit-def $vgpr83_hi16, implicit-def $vgpr84, implicit-def $vgpr84_lo16, implicit-def $vgpr84_hi16, implicit-def $vgpr81_vgpr82, implicit-def $vgpr81_vgpr82_vgpr83, implicit-def $vgpr82_vgpr83, implicit-def $vgpr82_vgpr83_vgpr84, implicit-def $vgpr83_vgpr84, implicit-def $vgpr85_vgpr86_vgpr87_vgpr88, implicit-def $vgpr85, implicit-def $vgpr85_lo16, implicit-def $vgpr85_hi16, implicit-def $vgpr86, implicit-def $vgpr86_lo16, implicit-def $vgpr86_hi16, implicit-def $vgpr87, implicit-def $vgpr87_lo16, implicit-def $vgpr87_hi16, implicit-def $vgpr88, implicit-def $vgpr88_lo16, implicit-def $vgpr88_hi16, implicit-def $vgpr85_vgpr86, implicit-def $vgpr85_vgpr86_vgpr87, implicit-def $vgpr86_vgpr87, implicit-def $vgpr86_vgpr87_vgpr88, implicit-def $vgpr87_vgpr88, implicit $vgpr36, implicit $exec, implicit $vgpr37, implicit $vgpr38 { + $vgpr41_vgpr42_vgpr43_vgpr44 = DS_READ_B128_gfx9 $vgpr36, 0, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr45_vgpr46_vgpr47_vgpr48 = DS_READ2_B64_gfx9 $vgpr36, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr49_vgpr50_vgpr51_vgpr52 = 
DS_READ_B128_gfx9 $vgpr37, 0, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr53_vgpr54_vgpr55_vgpr56 = DS_READ2_B64_gfx9 $vgpr37, 2, 3, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr57_vgpr58_vgpr59_vgpr60 = DS_READ_B128_gfx9 $vgpr37, 768, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr61_vgpr62_vgpr63_vgpr64 = DS_READ2_B64_gfx9 $vgpr37, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr65_vgpr66_vgpr67_vgpr68 = DS_READ_B128_gfx9 $vgpr36, 768, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr69_vgpr70_vgpr71_vgpr72 = DS_READ2_B64_gfx9 $vgpr36, 98, 99, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr73_vgpr74_vgpr75_vgpr76 = DS_READ_B128_gfx9 $vgpr36, 1536, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr77_vgpr78_vgpr79_vgpr80 = DS_READ2_B64_gfx9 $vgpr36, 194, 195, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr81_vgpr82_vgpr83_vgpr84 = DS_READ_B128_gfx9 $vgpr36, 2304, 0, implicit $exec :: (load (s128), addrspace 3) + $vgpr85_vgpr86_vgpr87_vgpr88 = DS_READ2_B64_gfx9 $vgpr38, 0, 1, 0, implicit $exec :: (load (s128), addrspace 3) + } + $sgpr1 = S_ADD_I32 $sgpr0, 16, implicit-def dead $scc + $vgpr39 = V_ADD_U32_e32 32, killed $vgpr39, implicit $exec + $vgpr40 = V_ADD_U32_e32 32, killed $vgpr40, implicit $exec + S_CMP_LT_U32 killed $sgpr0, $sgpr12, implicit-def $scc + early-clobber $vgpr29_vgpr30_vgpr31_vgpr32 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr29_vgpr30_vgpr31_vgpr32, 0, 0, implicit $exec + early-clobber $vgpr25_vgpr26_vgpr27_vgpr28 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr25_vgpr26_vgpr27_vgpr28, 0, 0, implicit $exec + early-clobber $vgpr21_vgpr22_vgpr23_vgpr24 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr21_vgpr22_vgpr23_vgpr24, 0, 0, implicit $exec + early-clobber $vgpr17_vgpr18_vgpr19_vgpr20 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr17_vgpr18_vgpr19_vgpr20, 0, 0, implicit $exec + early-clobber $vgpr13_vgpr14_vgpr15_vgpr16 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr13_vgpr14_vgpr15_vgpr16, 0, 0, implicit $exec + early-clobber $vgpr9_vgpr10_vgpr11_vgpr12 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80, 8, $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr9_vgpr10_vgpr11_vgpr12, 0, 0, implicit $exec + early-clobber $vgpr5_vgpr6_vgpr7_vgpr8 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56, 8, killed $vgpr5_vgpr6_vgpr7_vgpr8, 0, 0, implicit $exec + early-clobber $vgpr1_vgpr2_vgpr3_vgpr4 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed $vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88, 8, killed $vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64, 8, killed $vgpr1_vgpr2_vgpr3_vgpr4, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 killed $sgpr1 + ATOMIC_FENCE 5, 
2 + S_BARRIER + ATOMIC_FENCE 4, 2 + +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index f3cb5a7..30f5277 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) { ; GFX9-LABEL: barrier_vmcnt_global: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v1, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: global_load_dword v3, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -369,10 +369,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX8-NEXT: flat_load_dword v3, v[2:3] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -393,10 +392,9 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX9-NEXT: flat_load_dword v3, v[2:3] ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier ; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index e3437fd..a42c8ac7 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -767,6 +767,136 @@ define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) { ret void } +define amdgpu_gfx_whole_wave void @realign_stack(i1 %active, i32 %x) { +; DAGISEL-LABEL: realign_stack: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_mov_b32 s1, s33 +; DAGISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_mov_b32 s2, s34 +; DAGISEL-NEXT: s_mov_b32 s34, s32 +; DAGISEL-NEXT: s_addk_co_i32 s32, 0x800 +; DAGISEL-NEXT: s_wait_storecnt 0x0 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; DAGISEL-NEXT: s_wait_storecnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_mov_b32 s32, s34 +; DAGISEL-NEXT: 
s_mov_b32 s34, s2 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_mov_b32 s33, s1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: realign_stack: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_mov_b32 s1, s33 +; GISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: s_mov_b32 s2, s34 +; GISEL-NEXT: s_mov_b32 s34, s32 +; GISEL-NEXT: s_addk_co_i32 s32, 0x800 +; GISEL-NEXT: s_wait_storecnt 0x0 +; GISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; GISEL-NEXT: s_wait_storecnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_mov_b32 s32, s34 +; GISEL-NEXT: s_mov_b32 s34, s2 +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_mov_b32 s33, s1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: realign_stack: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_mov_b32 s2, s33 +; DAGISEL64-NEXT: s_add_co_i32 s33, s32, 0x3ff +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_mov_b32 s3, s34 +; DAGISEL64-NEXT: s_mov_b32 s34, s32 +; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x800 +; DAGISEL64-NEXT: s_wait_storecnt 0x0 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; DAGISEL64-NEXT: s_wait_storecnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_mov_b32 s32, s34 +; DAGISEL64-NEXT: s_mov_b32 s34, s3 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_mov_b32 s33, s2 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: realign_stack: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_mov_b32 s2, s33 +; GISEL64-NEXT: s_add_co_i32 s33, s32, 0x3ff +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: s_mov_b32 s3, s34 +; GISEL64-NEXT: s_mov_b32 s34, s32 +; GISEL64-NEXT: s_addk_co_i32 s32, 0x800 +; GISEL64-NEXT: s_wait_storecnt 0x0 +; GISEL64-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; GISEL64-NEXT: s_wait_storecnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_mov_b32 s32, s34 +; GISEL64-NEXT: s_mov_b32 s34, s3 +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_mov_b32 s33, s2 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-DAGISEL-LABEL: realign_stack: +; GFX1250-DAGISEL: ; %bb.0: +; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s1, s33 +; GFX1250-DAGISEL-NEXT: s_add_co_i32 s33, s32, 0x3ff +; GFX1250-DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-DAGISEL-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s2, s34 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s34, s32 +; 
GFX1250-DAGISEL-NEXT: s_addk_co_i32 s32, 0x800 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s33 scope:SCOPE_SYS +; GFX1250-DAGISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s32, s34 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s34, s2 +; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 s33, s1 +; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31] + %fussy = alloca i32, align 1024, addrspace(5) + store volatile i32 %x, ptr addrspace(5) %fussy, align 1024 + ret void +} + define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { ; DAGISEL-LABEL: multiple_blocks: ; DAGISEL: ; %bb.0: diff --git a/llvm/test/CodeGen/BPF/BTF/variant-part.ll b/llvm/test/CodeGen/BPF/BTF/variant-part.ll new file mode 100644 index 0000000..1071e61 --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/variant-part.ll @@ -0,0 +1,87 @@ +; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; +; Source: +; #![no_std] +; #![no_main] +; +; pub enum MyEnum { +; First { a: u32, b: i32 }, +; Second(u32), +; } +; +; #[unsafe(no_mangle)] +; pub static X: MyEnum = MyEnum::First { a: 54, b: -23 }; +; +; #[cfg(not(test))] +; #[panic_handler] +; fn panic(_info: &core::panic::PanicInfo) -> ! { +; loop {} +; } +; Compilation flag: +; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc +; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o variant-part.bc +; llvm-dis variant-part.bc -o variant-part.ll + +; ModuleID = 'variant-part.bc' +source_filename = "c0znihgkvro8hs0n88fgrtg6x" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpfel" + +@X = constant [12 x i8] c"\00\00\00\006\00\00\00\E9\FF\FF\FF", align 4, !dbg !0 + +!llvm.module.flags = !{!22, !23, !24, !25} +!llvm.ident = !{!26} +!llvm.dbg.cu = !{!27} + +; CHECK-BTF: [1] STRUCT 'MyEnum' size=12 vlen=1 +; CHECK-BTF-NEXT: '(anon)' type_id=3 bits_offset=0 +; CHECK-BTF-NEXT: [2] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none) +; CHECK-BTF-NEXT: [3] UNION '(anon)' size=12 vlen=3 +; CHECK-BTF-NEXT: '(anon)' type_id=2 bits_offset=0 +; CHECK-BTF-NEXT: 'First' type_id=4 bits_offset=0 +; CHECK-BTF-NEXT: 'Second' type_id=6 bits_offset=0 +; CHECK-BTF-NEXT: [4] STRUCT 'First' size=12 vlen=2 +; CHECK-BTF-NEXT: 'a' type_id=2 bits_offset=32 +; CHECK-BTF-NEXT: 'b' type_id=5 bits_offset=64 +; CHECK-BTF-NEXT: [5] INT 'i32' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [6] STRUCT 'Second' size=12 vlen=1 +; CHECK-BTF-NEXT: '__0' type_id=2 bits_offset=32 +; CHECK-BTF-NEXT: [7] VAR 'X' type_id=1, linkage=global +; CHECK-BTF-NEXT: [8] DATASEC '.rodata' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=7 offset=0 size=12 + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 10, type: !4, isLocal: false, isDefinition: true, align: 32) +!2 = !DINamespace(name: "variant_part", scope: null) +!3 = !DIFile(filename: "variant-part/src/main.rs", directory: "/tmp/variant-part", checksumkind: CSK_MD5, checksum: "b94cd53886ea8f14cbc116b36bc7dd36") +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyEnum", scope: !2, file: !5, size: 
96, align: 32, flags: DIFlagPublic, elements: !6, templateParams: !16, identifier: "faba668fd9f71e9b7cf3b9ac5e8b93cb") +!5 = !DIFile(filename: "<unknown>", directory: "") +!6 = !{!7} +!7 = !DICompositeType(tag: DW_TAG_variant_part, scope: !4, file: !5, size: 96, align: 32, elements: !8, templateParams: !16, identifier: "e4aee046fc86d111657622fdcb8c42f7", discriminator: !21) +!8 = !{!9, !17} +!9 = !DIDerivedType(tag: DW_TAG_member, name: "First", scope: !7, file: !5, baseType: !10, size: 96, align: 32, extraData: i32 0) +!10 = !DICompositeType(tag: DW_TAG_structure_type, name: "First", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !11, templateParams: !16, identifier: "cc7748c842e275452db4205b190c8ff7") +!11 = !{!12, !14} +!12 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !10, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!13 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned) +!14 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !10, file: !5, baseType: !15, size: 32, align: 32, offset: 64, flags: DIFlagPublic) +!15 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed) +!16 = !{} +!17 = !DIDerivedType(tag: DW_TAG_member, name: "Second", scope: !7, file: !5, baseType: !18, size: 96, align: 32, extraData: i32 1) +!18 = !DICompositeType(tag: DW_TAG_structure_type, name: "Second", scope: !4, file: !5, size: 96, align: 32, flags: DIFlagPublic, elements: !19, templateParams: !16, identifier: "a2094b1381f3082d504fbd0903aa7c06") +!19 = !{!20} +!20 = !DIDerivedType(tag: DW_TAG_member, name: "__0", scope: !18, file: !5, baseType: !13, size: 32, align: 32, offset: 32, flags: DIFlagPublic) +!21 = !DIDerivedType(tag: DW_TAG_member, scope: !4, file: !5, baseType: !13, size: 32, align: 32, flags: DIFlagArtificial) +!22 = !{i32 8, !"PIC Level", i32 2} +!23 = !{i32 7, !"PIE Level", i32 2} +!24 = !{i32 7, !"Dwarf Version", i32 4} +!25 = !{i32 2, !"Debug Info Version", i32 3} +!26 = !{!"rustc version 1.91.0-nightly (160e7623e 2025-08-26)"} +!27 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !28, producer: "clang LLVM (rustc version 1.91.0-nightly (160e7623e 2025-08-26))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !29, splitDebugInlining: false, nameTableKind: None) +!28 = !DIFile(filename: "variant-part/src/main.rs/@/c0znihgkvro8hs0n88fgrtg6x", directory: "/tmp/variant-part") +!29 = !{!0} diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll index 6eece70..26ccf59 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll @@ -149,10 +149,10 @@ define void @f3(ptr %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 { ; CHECK-NEXT: [[CUP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> zeroinitializer, <32 x i32> splat (i32 -1), i32 [[PTI1]]) ; CHECK-NEXT: [[CST17:%.*]] = bitcast <32 x i32> [[CUP16]] to <128 x i8> ; CHECK-NEXT: [[TRN:%.*]] = trunc <128 x i8> [[CST6]] to <128 x i1> -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST4]], ptr [[ITP]], i32 128, <128 x i1> [[TRN]]), !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST4]], ptr align 128 [[ITP]], <128 x i1> [[TRN]]), !tbaa [[TBAA5:![0-9]+]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ITP]], i32 128 ; CHECK-NEXT: [[TRN18:%.*]] = trunc <128 x i8> [[CST12]] to <128 x i1> -; CHECK-NEXT: call void 
@llvm.masked.store.v128i8.p0(<128 x i8> [[CST10]], ptr [[GEP]], i32 128, <128 x i1> [[TRN18]]), !tbaa [[TBAA5]] +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST10]], ptr align 128 [[GEP]], <128 x i1> [[TRN18]]), !tbaa [[TBAA5]] ; CHECK-NEXT: [[GEP19:%.*]] = getelementptr i8, ptr [[ITP]], i32 256 ; CHECK-NEXT: [[AND20:%.*]] = and i32 [[PTI1]], 127 ; CHECK-NEXT: [[ISZ:%.*]] = icmp ne i32 [[AND20]], 0 @@ -202,10 +202,10 @@ define void @f4(ptr %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 { ; CHECK-NEXT: [[CUP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> zeroinitializer, <32 x i32> splat (i32 -1), i32 [[PTI1]]) ; CHECK-NEXT: [[CST17:%.*]] = bitcast <32 x i32> [[CUP16]] to <128 x i8> ; CHECK-NEXT: [[TRN:%.*]] = trunc <128 x i8> [[CST6]] to <128 x i1> -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST4]], ptr [[ITP]], i32 128, <128 x i1> [[TRN]]) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST4]], ptr align 128 [[ITP]], <128 x i1> [[TRN]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ITP]], i32 128 ; CHECK-NEXT: [[TRN18:%.*]] = trunc <128 x i8> [[CST12]] to <128 x i1> -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST10]], ptr [[GEP]], i32 128, <128 x i1> [[TRN18]]) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST10]], ptr align 128 [[GEP]], <128 x i1> [[TRN18]]) ; CHECK-NEXT: [[GEP19:%.*]] = getelementptr i8, ptr [[ITP]], i32 256 ; CHECK-NEXT: [[AND20:%.*]] = and i32 [[PTI1]], 127 ; CHECK-NEXT: [[ISZ:%.*]] = icmp ne i32 [[AND20]], 0 @@ -255,10 +255,10 @@ define void @f5(ptr %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 { ; CHECK-NEXT: [[CUP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.vlalignb.128B(<32 x i32> zeroinitializer, <32 x i32> splat (i32 -1), i32 [[PTI1]]) ; CHECK-NEXT: [[CST17:%.*]] = bitcast <32 x i32> [[CUP16]] to <128 x i8> ; CHECK-NEXT: [[TRN:%.*]] = trunc <128 x i8> [[CST6]] to <128 x i1> -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST4]], ptr [[ITP]], i32 128, <128 x i1> [[TRN]]), !tbaa [[TBAA5]] +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST4]], ptr align 128 [[ITP]], <128 x i1> [[TRN]]), !tbaa [[TBAA5]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ITP]], i32 128 ; CHECK-NEXT: [[TRN18:%.*]] = trunc <128 x i8> [[CST12]] to <128 x i1> -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST10]], ptr [[GEP]], i32 128, <128 x i1> [[TRN18]]) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[CST10]], ptr align 128 [[GEP]], <128 x i1> [[TRN18]]) ; CHECK-NEXT: [[GEP19:%.*]] = getelementptr i8, ptr [[ITP]], i32 256 ; CHECK-NEXT: [[AND20:%.*]] = and i32 [[PTI1]], 127 ; CHECK-NEXT: [[ISZ:%.*]] = icmp ne i32 [[AND20]], 0 diff --git a/llvm/test/CodeGen/Hexagon/insert-big.ll b/llvm/test/CodeGen/Hexagon/insert-big.ll new file mode 100644 index 0000000..8735a66 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/insert-big.ll @@ -0,0 +1,47 @@ +; Check that llc does not abort, which happened due to incorrect MIR. +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 < %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 < %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 < %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 < %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 < %s + +; Look for this symptom, in case llc does not check invalid IR. 
+; CHECK-NOT: insert(%14,%5,#5,#5) + +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=1 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=2 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=3 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=4 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s +; RUN: llc -O2 -mtriple=hexagon -insert-max-ifmap=5 -debug-only=hexinsert -stop-after hexinsert < %s 2>&1 | FileCheck %s + +; REQUIRES: asserts + +define i32 @f(i32 %0, i32 %1, i32 %2) { +entry: + switch i32 %0, label %common.ret1 [ + i32 8907, label %3 + i32 4115, label %6 + ] + +common.ret1: + %common.ret1.op = phi i32 [ %5, %3 ], [ %526, %6 ], [ 0, %entry ] + ret i32 %common.ret1.op + +3: + %4 = shl i32 %2, 5 + %5 = and i32 %4, 992 + br label %common.ret1 + +6: + %7 = shl i32 %0, 10 + %8 = and i32 %7, 7168 + %9 = shl i32 %0, 5 + %10 = and i32 %9, 992 + %11 = or i32 %10, %8 + %12 = and i32 %0, 1 + %13 = or i32 %11, %12 + %14 = shl i32 %1, 1 + %15 = and i32 %14, 2031616 + %526 = or i32 %13, %15 + br label %common.ret1 +} diff --git a/llvm/test/CodeGen/Hexagon/qfp-conv.ll b/llvm/test/CodeGen/Hexagon/qfp-conv.ll new file mode 100644 index 0000000..d2d393e --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/qfp-conv.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=hexagon -mattr=+hvxv68,+hvx,+hvx-length128b < %s | FileCheck %s + +; Test that the Qfloat optimization pass doesn't crash due to invalid +; instructions. + +; CHECK: v{{[0-9]+}}.hf = v{{[0-9]:[0-9]}}.qf32 + +define void @test( + <32 x i32>* %optr, + <64 x i32> %in64, + <32 x i32> %va, + <32 x i32> %vb +) local_unnamed_addr #0 { +entry: + br label %for.body + +for.body: + %optr.068 = phi <32 x i32>* [ %optr, %entry ], [ %incdec.ptr6, %for.body ] + %0 = tail call <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32> %in64) #2 + %1 = tail call <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32> %0) #2 + %2 = tail call <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32> %va, <32 x i32> %1) #2 + %3 = tail call <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1> %2, <32 x i32> %va, <32 x i32> %vb) #2 + %4 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %3, <32 x i32> %vb) #2 + %5 = tail call <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32> %va, <32 x i32> %4) #2 + store <32 x i32> %5, <32 x i32>* %optr.068, align 1 + %incdec.ptr6 = getelementptr inbounds <32 x i32>, <32 x i32>* %optr.068, i32 1 + br label %for.body +} + +declare <32 x i32> @llvm.hexagon.V6.vdealh.128B(<32 x i32>) #1 +declare <32 x i32> @llvm.hexagon.V6.vconv.hf.qf32.128B(<64 x i32>) #1 +declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #1 +declare <32 x i32> @llvm.hexagon.V6.vpackhub.sat.128B(<32 x i32>, <32 x i32>) #1 +declare <128 x i1> @llvm.hexagon.V6.vgth.128B(<32 x i32>, <32 x i32>) #1 +declare <32 x i32> @llvm.hexagon.V6.vmux.128B(<128 x i1>, <32 x i32>, <32 x i32>) #1 diff --git a/llvm/test/CodeGen/Hexagon/qfp-enabled.ll b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll new file mode 100644 index 0000000..a5cc5fa --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/qfp-enabled.ll @@ -0,0 +1,19 @@ +; Tests that the flag to disable the qfp optimizer pass works.
+ +; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \ +; RUN: < %s -o -| FileCheck %s --check-prefix=ENABLED +; RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b \ +; RUN: -disable-qfp-opt < %s -o -| FileCheck %s --check-prefix=DISABLED + +define dso_local <32 x i32> @conv1_qf32(<32 x i32> noundef %input1, <32 x i32> noundef %input2) local_unnamed_addr { +entry: +; DISABLED: [[V2:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf) +; DISABLED: [[V3:v[0-9]+]].sf = [[V2]].qf32 +; DISABLED: qf32 = vadd(v0.sf,[[V3]].sf) +; ENABLED: [[V4:v[0-9]+]].qf32 = vadd(v0.sf,v1.sf) +; ENABLED: qf32 = vadd([[V4]].qf32,v0.sf) + %0 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %input2) + %1 = tail call <32 x i32> @llvm.hexagon.V6.vconv.sf.qf32.128B(<32 x i32> %0) + %2 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.128B(<32 x i32> %input1, <32 x i32> %1) + ret <32 x i32> %2 +} diff --git a/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir new file mode 100644 index 0000000..d8dde7d --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/qfp-remove-kill.mir @@ -0,0 +1,95 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer -run-pass machineverifier %s -o - | FileCheck %s + +# Test that the killed RegState from DefMI operands is removed. +# The killed RegState should be set on the MI operands instead. +# CHECK-LABEL: name: qfpAdd +# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]] +# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]] +# CHECK-NEXT: V6_vadd_qf32 killed %[[REG1]], killed %[[REG2]] +# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG3:([0-9]+)]] +# CHECK-NEXT: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG4:([0-9]+)]] +# CHECK-NEXT: V6_vadd_qf32 killed %[[REG3]], killed %[[REG4]] + +--- +name: qfpAdd +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:intregs = COPY $r3 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %7:hvxvr = V6_vL32Ub_ai %3:intregs, 0 + %8:hvxvr = V6_vconv_sf_qf32 killed %4:hvxvr + %9:hvxvr = V6_vconv_sf_qf32 killed %5:hvxvr + %10:hvxvr = V6_vadd_sf %8:hvxvr, %9:hvxvr + %11:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr + %12:hvxvr = V6_vconv_sf_qf32 killed %7:hvxvr + %13:hvxvr = V6_vadd_sf killed %11:hvxvr, killed %12:hvxvr +... + + +# Test that the killed RegState from DefMI operands is removed. +# CHECK-LABEL: name: qfpAddMix +# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]] +# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}} +# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]] +# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}} + +--- +name: qfpAddMix +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %6:hvxvr = V6_vmpy_qf32_sf %4, %5 + %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr + %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr + %9:hvxvr = V6_vmpy_qf32_sf %4, %5 + %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr + %11:hvxvr = V6_vadd_sf %3:hvxvr, killed %10:hvxvr +... + + +# Test that we do generate V6_vadd_qf32_mix for the below test.
+# V6_vadd_qf32_mix only allows qf32 as the first operand. In the test qf32 +# is passed as the first operand. So, V6_vadd_qf32_mix must be generated. +# CHECK-LABEL: name: qfpAddSwapMix +# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG1:([0-9]+)]] +# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG1]], %{{[0-9]+}} +# CHECK: %{{[0-9]+}}:hvxvr = V6_vconv_sf_qf32 %[[REG2:([0-9]+)]] +# CHECK-NEXT: V6_vadd_qf32_mix killed %[[REG2]], %{{[0-9]+}} + +--- +name: qfpAddSwapMix +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %6:hvxvr = V6_vmpy_qf32_sf %4, %5 + %7:hvxvr = V6_vconv_sf_qf32 killed %6:hvxvr + %8:hvxvr = V6_vadd_sf %7:hvxvr, %3:hvxvr + %9:hvxvr = V6_vmpy_qf32_sf %4, %5 + %10:hvxvr = V6_vconv_sf_qf32 killed %9:hvxvr + %11:hvxvr = V6_vadd_sf killed %10:hvxvr, %3:hvxvr +... diff --git a/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir new file mode 100644 index 0000000..1d78203 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/qfp-subreg-bug.mir @@ -0,0 +1,33 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv69 -mattr=+hvxv69,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s + +# CHECK: V6_vshuffvdd +# CHECK: V6_vadd_sf +# CHECK: V6_vadd_qf32_mix{{.*}}vsub_lo +# CHECK: V6_vadd_qf32_mix{{.*}}vsub_hi + +--- +name: qfp_subreg_fix +alignment: 16 +tracksRegLiveness: true + +body: | + bb.0: + %10:intregs = IMPLICIT_DEF + %9:hvxvr = V6_vL32Ub_ai %10, 0 :: (load (s1024) from `ptr undef`, align 4) + %11:intregs = A2_tfrsi 15360 + %12:hvxvr = V6_lvsplath %11 + %13:hvxwr = V6_vmpy_qf32_hf %9, %12 + %15:hvxvr = V6_vconv_sf_qf32 %13.vsub_lo + %17:hvxvr = V6_vconv_sf_qf32 %13.vsub_hi + %18:intregslow8 = A2_tfrsi -4 + %19:hvxwr = V6_vshuffvdd %17, %15, %18 + %21:hvxvr = V6_vadd_sf %19.vsub_hi, %19.vsub_hi + %22:hvxvr = V6_vconv_sf_qf32 %21 + %24:hvxvr = V6_vadd_sf %19.vsub_lo, %19.vsub_lo + %25:hvxvr = V6_vconv_sf_qf32 %24 + %26:hvxvr = V6_vadd_sf %25, %19.vsub_lo + %27:hvxvr = V6_vconv_sf_qf32 %26 + %28:hvxvr = V6_vadd_sf %22, %19.vsub_hi + %29:hvxvr = V6_vconv_sf_qf32 %28 + +...
diff --git a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll new file mode 100644 index 0000000..c16370c --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll @@ -0,0 +1,21 @@ +; Tests that the generated vadd instruction takes a qf32 +; value as its first operand instead of an sf value, without +; an intervening sf = qf32 conversion instruction. + +; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s + +; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]]) +; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf) +; CHECK: [[V1:v[0-9]+]].qf32 = vmpy([[V1]].sf,[[V2]].sf) +; CHECK: [[V4:v[0-9]+]].qf32 = vadd([[V0]].qf32,[[V2]].sf) +; CHECK: [[V5:v[0-9]+]].qf32 = vadd([[V1]].qf32,[[V2]].sf) + +define void @_Z19compute_ripple_geluIDF16_EviPT_PKS0_(ptr %out_ptr, <64 x float> %conv14.ripple.vectorized) #0 { +entry: + %mul16.ripple.vectorized = fmul <64 x float> %conv14.ripple.vectorized, zeroinitializer + %conv17.ripple.vectorized = fptrunc <64 x float> %mul16.ripple.vectorized to <64 x half> + store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2 + ret void +} + +attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" } diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir new file mode 100644 index 0000000..9a9e938 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect/qfp-mix.mir @@ -0,0 +1,79 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \ +# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s + + +# Test that the operands are swapped for Add if the second operand +# is a qf32 to sf conversion. V6_vadd_qf32_mix supports the first operand +# as qf32. +# CHECK-LABEL: name: qfpAddMix +# CHECK: %[[REG:([0-9]+)]]:hvxvr = V6_vmpy_qf32_sf +# CHECK: V6_vadd_qf32_mix %[[REG]] + +--- +name: qfpAddMix +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %6:hvxvr = V6_vmpy_qf32_sf %4, %5 + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vadd_sf %3:hvxvr, %7:hvxvr +... + + +# Test that we do not generate V6_vsub_qf32_mix for the below test. +# V6_vsub_qf32_mix only allows qf32 as first operand. In the test qf32 +# is passed as second operand. As sub is not commutative, we should not +# generate the mix instruction. +# CHECK-LABEL: name: qfpSubNoMix +# CHECK-NOT: V6_vsub_qf32_mix + +--- +name: qfpSubNoMix +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %6:hvxvr = V6_vmpy_qf32_sf %4, %5 + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vsub_sf %3:hvxvr, %7:hvxvr +... + + +# Test that we do generate V6_vsub_qf32_mix for the below test. +# V6_vsub_qf32_mix only allows qf32 as first operand. In the test qf32 +# is passed as first operand. So, V6_vsub_qf32_mix must be generated.
+# CHECK-LABEL: name: qfpSubMix +# CHECK: V6_vsub_qf32_mix + +--- +name: qfpSubMix +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1, $r2, $r3 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:intregs = COPY $r2 + %3:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %4:hvxvr = V6_vL32Ub_ai %1:intregs, 0 + %5:hvxvr = V6_vL32Ub_ai %2:intregs, 0 + %6:hvxvr = V6_vmpy_qf32_sf %4, %5 + %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr + %8:hvxvr = V6_vsub_sf %7:hvxvr, %3:hvxvr +... diff --git a/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir new file mode 100644 index 0000000..f0b1d3c --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect/qfp-zeroinit.mir @@ -0,0 +1,23 @@ +# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s + +# CHECK-LABEL: name: qfpAdd32 +# CHECK: V6_vd0 +# CHECK-NEXT: V6_vL32Ub_ai +# CHECK-NEXT: V6_vadd_sf +# CHECK-NEXT: V6_vconv_sf_qf32 +# CHECK-NEXT: V6_vS32Ub_ai +--- +name: qfpAdd32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $r0, $r1 + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %3:hvxvr = V6_vd0 + %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0 + %5:hvxvr = V6_vadd_sf %3:hvxvr, %4:hvxvr + %6:hvxvr = V6_vconv_sf_qf32 %5:hvxvr + V6_vS32Ub_ai %1:intregs, 0, %6:hvxvr +... diff --git a/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll new file mode 100644 index 0000000..2d46da7 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/vect/unique-vreg-def.ll @@ -0,0 +1,32 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +; REQUIRES: hexagon + +; This test was asserting because getVRegDef() was called on a register with +; multiple defs. +; Checks that the test does not assert and vsub is generated. 
+; CHECK: vsub + +target triple = "hexagon" + +@v = common dso_local local_unnamed_addr global <32 x i32> zeroinitializer, align 128 + +; Function Attrs: nounwind +define dso_local void @hvx_twoSum(<32 x i32>* nocapture noundef writeonly %s2lo) local_unnamed_addr #0 { +entry: + %0 = load <32 x i32>, <32 x i32>* @v, align 128 + %call = tail call inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef %0) #3 + %1 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32> %call, <32 x i32> %call) + store <32 x i32> %1, <32 x i32>* @v, align 128 + store <32 x i32> %1, <32 x i32>* %s2lo, align 128 + ret void +} + +declare dso_local inreg <32 x i32> @MY_Vsf_equals_Vqf32(<32 x i32> noundef) local_unnamed_addr #1 + +; Function Attrs: nofree nosync nounwind readnone +declare <32 x i32> @llvm.hexagon.V6.vsub.sf.128B(<32 x i32>, <32 x i32>) #2 + +attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" } +attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv73" "target-features"="+hvx-length128b,+hvxv73,+v73,-long-calls" } +attributes #2 = { nofree nosync nounwind readnone } +attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/LoongArch/lasx/abs.ll b/llvm/test/CodeGen/LoongArch/lasx/abs.ll index e3b0d04d..dd60c67 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/abs.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/abs.ll @@ -6,8 +6,7 @@ define void @vabs_b(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.b $xr1, $xr0 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.b $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -21,8 +20,7 @@ define void @vabs_b_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_b_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.b $xr1, $xr0 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.b $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -36,8 +34,7 @@ define void @vabs_h(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_h: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.h $xr1, $xr0 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.h $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -51,8 +48,7 @@ define void @vabs_h_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_h_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.h $xr1, $xr0 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.h $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -66,8 +62,7 @@ define void @vabs_w(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_w: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.w $xr1, $xr0 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.w $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -81,8 +76,7 @@ define void @vabs_w_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_w_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.w $xr1, $xr0 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.w $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -96,8 +90,7 @@ define void @vabs_d(ptr %dst, ptr %src) { ; 
CHECK-LABEL: vabs_d: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.d $xr1, $xr0 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.d $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -111,8 +104,7 @@ define void @vabs_d_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_d_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 -; CHECK-NEXT: xvneg.d $xr1, $xr0 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsigncov.d $xr0, $xr0, $xr0 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll index 37b62ca..69437a2 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll @@ -33,6 +33,18 @@ define <8 x i32> @shufflevector_xvshuf4i_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %c } +;; xvshuf4i.d +define <4 x i64> @shufflevector_xvshuf4i_v4i64(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: shufflevector_xvshuf4i_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvbsrl.v $xr0, $xr0, 8 +; CHECK-NEXT: xvbsll.v $xr1, $xr1, 8 +; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0 +; CHECK-NEXT: ret + %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 4, i32 3, i32 6> + ret <4 x i64> %c +} + ;; xvshuf4i.w define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: shufflevector_xvshuf4i_v8f32: @@ -42,3 +54,16 @@ define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> ret <8 x float> %c } + +;; xvshuf4i.d +define <4 x double> @shufflevector_xvshuf4i_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: shufflevector_xvshuf4i_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) +; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI5_0) +; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 +; CHECK-NEXT: xvori.b $xr0, $xr2, 0 +; CHECK-NEXT: ret + %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/shufflevector-reverse.ll b/llvm/test/CodeGen/LoongArch/lasx/shufflevector-reverse.ll new file mode 100644 index 0000000..b57d90c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/shufflevector-reverse.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s + +define void @shufflevector_reverse_v32i8(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0) +; CHECK-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 +; CHECK-NEXT: xvshuf.b $xr0, $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %b = shufflevector <32 x i8> %va, <32 x i8> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 
4, i32 3, i32 2, i32 1, i32 0> + store <32 x i8> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v16i16(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) +; CHECK-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI1_0) +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 +; CHECK-NEXT: xvshuf.h $xr1, $xr0, $xr0 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %b = shufflevector <16 x i16> %va, <16 x i16> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <16 x i16> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v8i32(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 +; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 27 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %b = shufflevector <8 x i32> %va, <8 x i32> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <8 x i32> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v4i64(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 27 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %b = shufflevector <4 x i64> %va, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + store <4 x i64> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v8f32(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 +; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 27 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x float>, ptr %a + %b = shufflevector <8 x float> %va, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <8 x float> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v4f64(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 27 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x double>, ptr %a + %b = shufflevector <4 x double> %va, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + store <4 x double> %b, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/abs.ll b/llvm/test/CodeGen/LoongArch/lsx/abs.ll index 85fe1fe..6f8ccf4 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/abs.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/abs.ll @@ -6,8 +6,7 @@ define void @vabs_b(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_b: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.b $vr1, $vr0 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.b $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -21,8 +20,7 @@ define void @vabs_b_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_b_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.b $vr1, $vr0 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.b $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ 
-36,8 +34,7 @@ define void @vabs_h(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_h: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.h $vr1, $vr0 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.h $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -51,8 +48,7 @@ define void @vabs_h_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_h_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.h $vr1, $vr0 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.h $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -66,8 +62,7 @@ define void @vabs_w(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_w: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.w $vr1, $vr0 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.w $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -81,8 +76,7 @@ define void @vabs_w_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_w_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.w $vr1, $vr0 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.w $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -96,8 +90,7 @@ define void @vabs_d(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_d: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.d $vr1, $vr0 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.d $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -111,8 +104,7 @@ define void @vabs_d_1(ptr %dst, ptr %src) { ; CHECK-LABEL: vabs_d_1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 -; CHECK-NEXT: vneg.d $vr1, $vr0 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsigncov.d $vr0, $vr0, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/shufflevector-reverse.ll b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-reverse.ll new file mode 100644 index 0000000..29f038a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/shufflevector-reverse.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s + +define void @shufflevector_reverse_v16i8(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0) +; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %b = shufflevector <16 x i8> %va, <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <16 x i8> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v8i16(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) +; CHECK-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI1_0) +; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0 +; CHECK-NEXT: vst $vr1, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %b = shufflevector <8 x i16> %va, <8 x i16> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, 
i32 0> + store <8 x i16> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v4i32(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 27 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %b = shufflevector <4 x i32> %va, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + store <4 x i32> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v2i64(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vshuf4i.d $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %b = shufflevector <2 x i64> %va, <2 x i64> poison, <2 x i32> <i32 1, i32 0> + store <2 x i64> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v4f32(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 27 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x float>, ptr %a + %b = shufflevector <4 x float> %va, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + store <4 x float> %b, ptr %res + ret void +} + +define void @shufflevector_reverse_v2f64(ptr %res, ptr %a) nounwind { +; CHECK-LABEL: shufflevector_reverse_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vshuf4i.d $vr0, $vr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x double>, ptr %a + %b = shufflevector <2 x double> %va, <2 x double> poison, <2 x i32> <i32 1, i32 0> + store <2 x double> %b, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll index dcb70f8..f9527ef 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbs.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll @@ -45,6 +45,32 @@ define i32 @bclr_i32_no_mask(i32 %a, i32 %b) nounwind { ret i32 %and1 } +define i32 @bclr_i32_mask_multiple(i32 %a, i32 %b, i32 %shamt) nounwind { +; RV32I-LABEL: bclr_i32_mask_multiple: +; RV32I: # %bb.0: +; RV32I-NEXT: li a3, 1 +; RV32I-NEXT: sll a2, a3, a2 +; RV32I-NEXT: not a3, a2 +; RV32I-NEXT: and a0, a3, a0 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBS-LABEL: bclr_i32_mask_multiple: +; RV32ZBS: # %bb.0: +; RV32ZBS-NEXT: bclr a0, a0, a2 +; RV32ZBS-NEXT: bset a1, a1, a2 +; RV32ZBS-NEXT: add a0, a0, a1 +; RV32ZBS-NEXT: ret + %shamt_masked = and i32 %shamt, 63 + %shl = shl nuw i32 1, %shamt_masked + %neg = xor i32 %shl, -1 + %and = and i32 %neg, %a + %or = or i32 %b, %shl + %c = add i32 %and, %or + ret i32 %c +} + define i64 @bclr_i64(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: bclr_i64: ; RV32I: # %bb.0: @@ -301,17 +327,17 @@ define i64 @bext_i64(i64 %a, i64 %b) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: andi a3, a2, 63 ; CHECK-NEXT: addi a4, a3, -32 -; CHECK-NEXT: bltz a4, .LBB12_2 +; CHECK-NEXT: bltz a4, .LBB13_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: srl a0, a1, a3 -; CHECK-NEXT: j .LBB12_3 -; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: j .LBB13_3 +; CHECK-NEXT: .LBB13_2: ; CHECK-NEXT: srl a0, a0, a2 ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: not a2, a3 ; CHECK-NEXT: sll a1, a1, a2 ; CHECK-NEXT: or a0, a0, a1 -; CHECK-NEXT: .LBB12_3: +; CHECK-NEXT: .LBB13_3: ; CHECK-NEXT: andi a0, a0, 1 ; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: ret @@ -789,17 +815,17 
@@ define i64 @bset_trailing_ones_i64_mask(i64 %a) nounwind { ; CHECK-NEXT: li a3, -1 ; CHECK-NEXT: addi a1, a2, -32 ; CHECK-NEXT: sll a0, a3, a0 -; CHECK-NEXT: bltz a1, .LBB43_2 +; CHECK-NEXT: bltz a1, .LBB44_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: sll a2, a3, a2 -; CHECK-NEXT: j .LBB43_3 -; CHECK-NEXT: .LBB43_2: +; CHECK-NEXT: j .LBB44_3 +; CHECK-NEXT: .LBB44_2: ; CHECK-NEXT: not a2, a2 ; CHECK-NEXT: lui a3, 524288 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: srl a2, a3, a2 ; CHECK-NEXT: or a2, a0, a2 -; CHECK-NEXT: .LBB43_3: +; CHECK-NEXT: .LBB44_3: ; CHECK-NEXT: srai a1, a1, 31 ; CHECK-NEXT: and a0, a1, a0 ; CHECK-NEXT: not a1, a2 @@ -817,17 +843,17 @@ define i64 @bset_trailing_ones_i64_no_mask(i64 %a) nounwind { ; CHECK-NEXT: li a1, -1 ; CHECK-NEXT: addi a2, a0, -32 ; CHECK-NEXT: sll a1, a1, a0 -; CHECK-NEXT: bltz a2, .LBB44_2 +; CHECK-NEXT: bltz a2, .LBB45_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: j .LBB44_3 -; CHECK-NEXT: .LBB44_2: +; CHECK-NEXT: j .LBB45_3 +; CHECK-NEXT: .LBB45_2: ; CHECK-NEXT: not a0, a0 ; CHECK-NEXT: lui a3, 524288 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: srl a0, a3, a0 ; CHECK-NEXT: or a0, a1, a0 -; CHECK-NEXT: .LBB44_3: +; CHECK-NEXT: .LBB45_3: ; CHECK-NEXT: srai a2, a2, 31 ; CHECK-NEXT: and a2, a2, a1 ; CHECK-NEXT: not a1, a0 diff --git a/llvm/test/CodeGen/RISCV/rv64zbs.ll b/llvm/test/CodeGen/RISCV/rv64zbs.ll index b4edcf6c..d42bc8e 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbs.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbs.ll @@ -110,6 +110,32 @@ define i64 @bclr_i64_no_mask(i64 %a, i64 %b) nounwind { ret i64 %and1 } +define i64 @bclr_i64_mask_multiple(i64 %a, i64 %b, i64 %shamt) nounwind { +; RV64I-LABEL: bclr_i64_mask_multiple: +; RV64I: # %bb.0: +; RV64I-NEXT: li a3, 1 +; RV64I-NEXT: sll a2, a3, a2 +; RV64I-NEXT: not a3, a2 +; RV64I-NEXT: and a0, a3, a0 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBS-LABEL: bclr_i64_mask_multiple: +; RV64ZBS: # %bb.0: +; RV64ZBS-NEXT: bclr a0, a0, a2 +; RV64ZBS-NEXT: bset a1, a1, a2 +; RV64ZBS-NEXT: add a0, a0, a1 +; RV64ZBS-NEXT: ret + %shamt_masked = and i64 %shamt, 63 + %shl = shl nuw i64 1, %shamt_masked + %neg = xor i64 %shl, -1 + %and = and i64 %neg, %a + %or = or i64 %b, %shl + %c = add i64 %and, %or + ret i64 %c +} + define signext i32 @bset_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: bset_i32: ; RV64I: # %bb.0: @@ -372,19 +398,19 @@ define void @bext_i32_trunc(i32 signext %0, i32 signext %1) { ; RV64I: # %bb.0: ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: andi a0, a0, 1 -; RV64I-NEXT: beqz a0, .LBB19_2 +; RV64I-NEXT: beqz a0, .LBB20_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: ret -; RV64I-NEXT: .LBB19_2: +; RV64I-NEXT: .LBB20_2: ; RV64I-NEXT: tail bar ; ; RV64ZBS-LABEL: bext_i32_trunc: ; RV64ZBS: # %bb.0: ; RV64ZBS-NEXT: bext a0, a0, a1 -; RV64ZBS-NEXT: beqz a0, .LBB19_2 +; RV64ZBS-NEXT: beqz a0, .LBB20_2 ; RV64ZBS-NEXT: # %bb.1: ; RV64ZBS-NEXT: ret -; RV64ZBS-NEXT: .LBB19_2: +; RV64ZBS-NEXT: .LBB20_2: ; RV64ZBS-NEXT: tail bar %3 = shl i32 1, %1 %4 = and i32 %3, %0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-negative.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-negative.ll index bbdacac..8f7d738 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-negative.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-negative.ll @@ -15,7 +15,7 @@ define void @gather_bad_or(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: [[I:%.*]] = mul nuw nsw 
<32 x i64> [[VEC_IND]], splat (i64 5) ; CHECK-NEXT: [[OR:%.*]] = or <32 x i64> [[I]], splat (i64 1) ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i64> [[OR]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[I1]], i32 1, <32 x i1> splat (i1 true), <32 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 [[I1]], <32 x i1> splat (i1 true), <32 x i8> poison) ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] @@ -60,7 +60,7 @@ define void @gather_narrow_index(ptr noalias nocapture %A, ptr noalias nocapture ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw <32 x i32> [[VEC_IND]], splat (i32 5) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i32> [[TMP0]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[TMP1]], i32 1, <32 x i1> splat (i1 true), <32 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 [[TMP1]], <32 x i1> splat (i1 true), <32 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] @@ -105,7 +105,7 @@ define void @gather_broken_stride(ptr noalias nocapture %A, ptr noalias nocaptur ; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], splat (i64 5) ; CHECK-NEXT: [[OR:%.*]] = or <32 x i64> [[I]], splat (i64 1) ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i64> [[OR]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[I1]], i32 1, <32 x i1> splat (i1 true), <32 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 [[I1]], <32 x i1> splat (i1 true), <32 x i8> poison) ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll index 4a52546..62b65dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll @@ -370,7 +370,7 @@ define void @negative_shl_non_commute(ptr noalias nocapture %A, ptr noalias noca ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I:%.*]] = shl nsw <8 x i64> [[DOTSPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <8 x i64> [[I]] -; 
CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[I1]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[I1]], <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1 ; CHECK-NEXT: [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] @@ -663,8 +663,8 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur ; ZVE32F-NEXT: [[I6:%.*]] = add <2 x i64> [[I5]], splat (i64 10) ; ZVE32F-NEXT: [[I7:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], <2 x i64> [[I4]] ; ZVE32F-NEXT: [[I8:%.*]] = getelementptr inbounds ptr, ptr [[ARG1]], <2 x i64> [[I6]] -; ZVE32F-NEXT: [[I9:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> [[I7]], i32 8, <2 x i1> splat (i1 true), <2 x ptr> poison) -; ZVE32F-NEXT: [[I10:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> [[I8]], i32 8, <2 x i1> splat (i1 true), <2 x ptr> poison) +; ZVE32F-NEXT: [[I9:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> align 8 [[I7]], <2 x i1> splat (i1 true), <2 x ptr> poison) +; ZVE32F-NEXT: [[I10:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> align 8 [[I8]], <2 x i1> splat (i1 true), <2 x ptr> poison) ; ZVE32F-NEXT: [[I11:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 [[I]] ; ZVE32F-NEXT: store <2 x ptr> [[I9]], ptr [[I11]], align 8 ; ZVE32F-NEXT: [[I13:%.*]] = getelementptr inbounds ptr, ptr [[I11]], i64 2 @@ -744,8 +744,8 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu ; ZVE32F-NEXT: [[I12:%.*]] = add <2 x i64> [[I11]], splat (i64 10) ; ZVE32F-NEXT: [[I13:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], <2 x i64> [[I10]] ; ZVE32F-NEXT: [[I14:%.*]] = getelementptr inbounds ptr, ptr [[ARG]], <2 x i64> [[I12]] -; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I6]], <2 x ptr> [[I13]], i32 8, <2 x i1> splat (i1 true)) -; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I9]], <2 x ptr> [[I14]], i32 8, <2 x i1> splat (i1 true)) +; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I6]], <2 x ptr> align 8 [[I13]], <2 x i1> splat (i1 true)) +; ZVE32F-NEXT: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I9]], <2 x ptr> align 8 [[I14]], <2 x i1> splat (i1 true)) ; ZVE32F-NEXT: [[I15]] = add nuw i64 [[I]], 4 ; ZVE32F-NEXT: [[I16]] = add <2 x i64> [[I3]], splat (i64 4) ; ZVE32F-NEXT: [[I17:%.*]] = icmp eq i64 [[I15]], 1024 @@ -975,7 +975,7 @@ define void @gather_narrow_idx(ptr noalias nocapture %A, ptr noalias nocapture r ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i16> [[VEC_IND]], splat (i16 5) ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i16> [[I]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[I1]], i32 1, <32 x i1> splat (i1 true), <32 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> 
@llvm.masked.gather.v32i8.v32p0(<32 x ptr> align 1 [[I1]], <32 x i1> splat (i1 true), <32 x i8> poison) ; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1 ; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index f54ab9f..d801c51 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -507,7 +507,7 @@ define <vscale x 1 x i64> @neg_shl_is_not_commutative(ptr %p) { ; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[SPLAT_INSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer ; CHECK-NEXT: [[OFFSET:%.*]] = shl <vscale x 1 x i64> [[SPLAT]], [[STEP]] ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 1 x i64> [[OFFSET]] -; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[PTRS]], i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> poison) +; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[PTRS]], <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[X]] ; %step = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64() @@ -571,7 +571,7 @@ define void @scatter_loopless(<vscale x 1 x i64> %x, ptr %p, i64 %stride) { define void @constant_stride(<vscale x 1 x i64> %x, ptr %p, i64 %stride) { ; CHECK-LABEL: @constant_stride( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 1 x i64> zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x ptr> [[PTRS]], i32 8, <vscale x 1 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[X:%.*]], <vscale x 1 x ptr> align 8 [[PTRS]], <vscale x 1 x i1> splat (i1 true)) ; CHECK-NEXT: ret void ; %ptrs = getelementptr i32, ptr %p, <vscale x 1 x i64> zeroinitializer @@ -627,7 +627,7 @@ define <vscale x 1 x i64> @nonstrided_base_scalar_offset(ptr %p, <vscale x 1 x i ; CHECK-LABEL: @nonstrided_base_scalar_offset( ; CHECK-NEXT: [[PTRS1:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 1 x i64> [[V:%.*]] ; CHECK-NEXT: [[PTRS2:%.*]] = getelementptr i64, <vscale x 1 x ptr> [[PTRS1]], i64 [[OFFSET:%.*]] -; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[PTRS2]], i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> poison) +; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[PTRS2]], <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[X]] ; %ptrs1 = getelementptr i64, ptr %p, <vscale x 1 x i64> %v @@ -647,7 +647,7 @@ define <vscale x 1 x i64> @vector_base_vector_offset(ptr %p, <vscale x 1 x i64> ; CHECK-NEXT: [[STEP:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64() ; CHECK-NEXT: [[PTRS1:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 1 x i64> [[STEP]] ; CHECK-NEXT: [[PTRS2:%.*]] = getelementptr i64, <vscale x 1 x ptr> [[PTRS1]], <vscale x 1 x i64> [[OFFSET:%.*]] -; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[PTRS2]], i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x 
i64> poison) +; CHECK-NEXT: [[X:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[PTRS2]], <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[X]] ; %step = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64() diff --git a/llvm/test/CodeGen/SystemZ/int-conv-14.ll b/llvm/test/CodeGen/SystemZ/int-conv-14.ll index 98dc88f..baab5ac 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-14.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-14.ll @@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) { } ; Truncation to i64. -define i64 @f5(i128 %a) { +define i64 @f5(i128 %a, i128 %b) { ; CHECK-LABEL: f5: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvg %r2, %v0, 1 ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i64 ret i64 %res } @@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) { } ; Truncation to i32. -define i32 @f11(i128 %a) { +define i32 @f11(i128 %a, i128 %b) { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i32 ret i32 %res } @@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) { } ; Truncation to i16. -define i16 @f17(i128 %a) { +define i16 @f17(i128 %a, i128 %b) { ; CHECK-LABEL: f17: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i16 ret i16 %res } @@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) { } ; Truncation to i8. -define i8 @f23(i128 %a) { +define i8 @f23(i128 %a, i128 %b) { ; CHECK-LABEL: f23: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i8 ret i8 %res } @@ -385,15 +389,16 @@ define i128 @f28(ptr %ptr) { } ; Truncation to i1. -define i1 @f29(i128 %a) { +define i1 @f29(i128 %a, i128 %b) { ; CHECK-LABEL: f29: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i1 ret i1 %res } diff --git a/llvm/test/CodeGen/SystemZ/int-conv-15.ll b/llvm/test/CodeGen/SystemZ/int-conv-15.ll index 0d8ee75..f2c9ee5 100644 --- a/llvm/test/CodeGen/SystemZ/int-conv-15.ll +++ b/llvm/test/CodeGen/SystemZ/int-conv-15.ll @@ -55,14 +55,15 @@ define i128 @f4(ptr %ptr) { } ; Truncation to i64. 
-define i64 @f5(i128 %a) { +define i64 @f5(i128 %a, i128 %b) { ; CHECK-LABEL: f5: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvg %r2, %v0, 1 ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i64 ret i64 %res } @@ -134,15 +135,16 @@ define i128 @f10(ptr %ptr) { } ; Truncation to i32. -define i32 @f11(i128 %a) { +define i32 @f11(i128 %a, i128 %b) { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i32 ret i32 %res } @@ -215,15 +217,16 @@ define i128 @f16(ptr %ptr) { } ; Truncation to i16. -define i16 @f17(i128 %a) { +define i16 @f17(i128 %a, i128 %b) { ; CHECK-LABEL: f17: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i16 ret i16 %res } @@ -296,15 +299,16 @@ define i128 @f22(ptr %ptr) { } ; Truncation to i8. -define i8 @f23(i128 %a) { +define i8 @f23(i128 %a, i128 %b) { ; CHECK-LABEL: f23: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i8 ret i8 %res } @@ -383,15 +387,16 @@ define i128 @f28(ptr %ptr) { } ; Truncation to i1. 
-define i1 @f29(i128 %a) { +define i1 @f29(i128 %a, i128 %b) { ; CHECK-LABEL: f29: ; CHECK: # %bb.0: -; CHECK-NEXT: vl %v0, 0(%r2), 3 -; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 ; CHECK-NEXT: vlgvf %r2, %v0, 3 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d ; CHECK-NEXT: br %r14 - %op = add i128 %a, %a + %op = add i128 %a, %b %res = trunc i128 %op to i1 ret i1 %res } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll index 9621897..b6f4d40 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll @@ -36,7 +36,7 @@ define hidden i32 @_Z4loopPiPjiS0_i(ptr noalias nocapture readonly %s1, ptr noal ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP5]]) ; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP5]], 4 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT72]], ptr [[LSR_IV9]], i32 4, <4 x i1> [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT72]], ptr align 4 [[LSR_IV9]], <4 x i1> [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP10]] = getelementptr i32, ptr [[LSR_IV9]], i32 4 ; CHECK-NEXT: [[TMP10]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP4]], i32 1) @@ -54,10 +54,10 @@ define hidden i32 @_Z4loopPiPjiS0_i(ptr noalias nocapture readonly %s1, ptr noal ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> undef, i32 [[TRIP_COUNT_MINUS_183]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp ule <4 x i32> [[INDUCTION86]], [[TMP14]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV6]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV3]], i32 4, <4 x i1> [[TMP15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV6]], <4 x i1> [[TMP15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD89:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV3]], <4 x i1> [[TMP15]], <4 x i32> undef) ; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[WIDE_MASKED_LOAD89]], <4 x i32> [[WIDE_MASKED_LOAD]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP16]], ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP15]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP16]], ptr align 4 [[LSR_IV]], <4 x i1> [[TMP15]]) ; CHECK-NEXT: [[INDEX_NEXT81]] = add i32 [[INDEX80]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i32, ptr [[LSR_IV3]], i32 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll index 0ddbbb4..c01d4eb 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -30,10 +30,10 @@ define void @mat_vec_sext_i16(ptr nocapture readonly %A, ptr nocapture readonly ; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i16, ptr [[TT3]], i32 [[INDEX]] ; 
CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TT6]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[TT6]], <4 x i1> [[TMP1]], <4 x i16> undef) ; CHECK-NEXT: [[TT9:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> ; CHECK-NEXT: [[TT10:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TT10]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[TT10]], <4 x i1> [[TMP1]], <4 x i16> undef) ; CHECK-NEXT: [[TT12:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD30]] to <4 x i32> ; CHECK-NEXT: [[TT13:%.*]] = mul nsw <4 x i32> [[TT12]], [[TT9]] ; CHECK-NEXT: [[TT14]] = add nsw <4 x i32> [[TT13]], [[VEC_PHI]] @@ -132,9 +132,9 @@ define void @mat_vec_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, p ; CHECK-NEXT: [[TT6:%.*]] = getelementptr inbounds i32, ptr [[TT3]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TT6]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TT6]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TT9:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TT9]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TT9]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TT11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD29]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TT12]] = add nsw <4 x i32> [[VEC_PHI]], [[TT11]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll index 11e7e5c..3e8c462 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll @@ -23,12 +23,12 @@ define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 16 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 4 [[TMP]], <16 x i1> [[TMP1]], <16 x i8> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i8> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 4 [[TMP3]], <16 x i1> [[TMP1]], <16 x i8> undef) ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <16 x i8> 
[[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[MUL]], ptr [[TMP6]], i32 4, <16 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[MUL]], ptr align 4 [[TMP6]], <16 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -92,12 +92,12 @@ define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP]], <8 x i1> [[TMP1]], <8 x i16> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP3]], <8 x i1> [[TMP1]], <8 x i16> undef) ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> [[MUL]], ptr align 4 [[TMP6]], <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -161,12 +161,12 @@ define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 
[[INDEX]], 4 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -230,18 +230,18 @@ define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readon ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[EXTRACT_1_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; CHECK-NEXT: [[EXTRACT_1_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 0, i32 2> ; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD2]], <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <2 x i32> [[EXTRACT_1_LOW]], [[EXTRACT_2_LOW]] ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <2 x i32> [[EXTRACT_1_HIGH]], [[EXTRACT_2_HIGH]] ; CHECK-NEXT: [[COMBINE:%.*]] = shufflevector <2 x i32> [[MUL]], <2 x i32> [[SUB]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[COMBINE]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[COMBINE]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -319,12 +319,12 @@ define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 ; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[WRONG]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[WRONG]], <4 x i32> undef) ; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -403,12 +403,12 @@ define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 ; CHECK-NEXT: [[WRONG:%.*]] = icmp ult <4 x i32> [[INDUCTION]], [[BROADCAST_SPLAT11]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[WRONG]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr align 4 [[TMP6]], <4 x i1> [[WRONG]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -495,30 +495,30 @@ define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocaptu ; CHECK-NEXT: [[V9:%.*]] = add i32 [[V8]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK17:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[V9]], i32 [[N]]) ; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -2 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP42]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP42]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[SCEVGEP43:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV38]], i32 -1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP43]], <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV38]], <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) ; CHECK-NEXT: [[SCEVGEP41:%.*]] = getelementptr <4 x i32>, ptr 
[[LSR_IV38]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP41]], <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) ; CHECK-NEXT: [[SCEVGEP34:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -2 -; CHECK-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[SCEVGEP34]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP34]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[SCEVGEP35:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 -1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP35]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[LSR_IV31]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP35]], <4 x i1> [[ACTIVE_LANE_MASK15]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD23:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV31]], <4 x i1> [[ACTIVE_LANE_MASK16]], <4 x i32> undef) ; CHECK-NEXT: [[SCEVGEP36:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV31]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull [[SCEVGEP36]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[SCEVGEP36]], <4 x i1> [[ACTIVE_LANE_MASK17]], <4 x i32> undef) ; CHECK-NEXT: [[V10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[V11:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD22]], [[WIDE_MASKED_LOAD18]] ; CHECK-NEXT: [[V12:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD23]], [[WIDE_MASKED_LOAD19]] ; CHECK-NEXT: [[V13:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD24]], [[WIDE_MASKED_LOAD20]] ; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -2 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V10]], ptr [[SCEVGEP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V10]], ptr align 4 [[SCEVGEP27]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 -1 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V11]], ptr [[SCEVGEP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK15]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V12]], ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK16]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V11]], ptr align 4 [[SCEVGEP28]], <4 x i1> [[ACTIVE_LANE_MASK15]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V12]], ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK16]]) ; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr <4 x i32>, ptr [[LSR_IV]], i32 1 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V13]], ptr [[SCEVGEP29]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK17]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V13]], ptr align 4 [[SCEVGEP29]], <4 x i1> 
[[ACTIVE_LANE_MASK17]]) ; CHECK-NEXT: [[SCEVGEP25]] = getelementptr i32, ptr [[LSR_IV]], i32 16 ; CHECK-NEXT: [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 16 ; CHECK-NEXT: [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 16 @@ -619,10 +619,10 @@ define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 42) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV14]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr align 4 [[LSR_IV17]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 @@ -692,10 +692,10 @@ define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[INDEX]]) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV14]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr align 4 [[LSR_IV17]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 @@ -767,10 +767,10 @@ define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalia ; 
CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 1, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = phi i32 [ [[START]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV14]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV14]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD12]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr [[LSR_IV17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP7]], ptr align 4 [[LSR_IV17]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index e70868d..7303efb 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -14,10 +14,10 @@ define dso_local void @foo(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]]) ; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP2]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP2]], <4 x i32> undef) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP2]]) ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4 @@ -120,10 +120,10 @@ define dso_local void @foo3(ptr noalias nocapture %A, ptr noalias nocapture read ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, 
i32 2, i32 3> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], splat (i32 32002) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -182,10 +182,10 @@ define dso_local void @foo5(ptr noalias nocapture %A, ptr noalias nocapture read ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002> -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -239,10 +239,10 @@ define dso_local void @inconsistent_tripcounts(ptr noalias nocapture %A, ptr noa ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 -1) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x 
i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -294,10 +294,10 @@ define dso_local void @overflow_in_sub(ptr noalias nocapture %A, ptr noalias noc ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -349,10 +349,10 @@ define dso_local void @IV_not_an_induction(ptr noalias nocapture %A, ptr noalias ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[N:%.*]], i32 32003) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: 
[[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -404,10 +404,10 @@ define dso_local void @IV_wrong_step(ptr noalias nocapture %A, ptr noalias nocap ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 3 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -462,10 +462,10 @@ define dso_local void @IV_step_not_constant(ptr noalias nocapture %A, ptr noalia ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV11]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 4 [[LSR_IV14]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[N:%.*]] ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4 @@ -528,10 +528,10 @@ define dso_local void @outerloop_phi(ptr noalias nocapture %A, ptr noalias nocap ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[J_025]], i32 
4096) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV33]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV38]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV33]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP1]], ptr [[LSR_IV28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP1]], ptr align 4 [[LSR_IV28]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[SCEVGEP29]] = getelementptr i32, ptr [[LSR_IV28]], i32 4 ; CHECK-NEXT: [[SCEVGEP34]] = getelementptr i32, ptr [[LSR_IV33]], i32 4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll index 9775cf9..3271e98 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-reduce.ll @@ -24,9 +24,9 @@ define i16 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, ; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[INSTTMP2]], <8 x i1> [[TMP5]], <8 x i16> undef) ; CHECK-NEXT: [[INSTTMP5:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP5]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[INSTTMP5]], <8 x i1> [[TMP5]], <8 x i16> undef) ; CHECK-NEXT: [[TMP7:%.*]] = add <8 x i16> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[WIDE_MASKED_LOAD3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 @@ -124,7 +124,7 @@ define i16 @reduction_i32_with_scalar(ptr nocapture readonly %A, i16 %B, i32 %N) ; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[INSTTMP2]], <8 x i1> [[TMP5]], <8 x i16> undef) ; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 @@ -217,7 +217,7 @@ define i16 @reduction_not_guarded(ptr 
nocapture readonly %A, i16 %B, i32 %N) loc ; CHECK-NEXT: [[INSTTMP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP4]]) ; CHECK-NEXT: [[TMP6]] = sub i32 [[TMP4]], 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[INSTTMP2]], i32 4, <8 x i1> [[TMP5]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[INSTTMP2]], <8 x i1> [[TMP5]], <8 x i16> undef) ; CHECK-NEXT: [[INSTTMP5:%.*]] = add <8 x i16> [[VEC_PHI]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[INSTTMP6]] = add <8 x i16> [[INSTTMP5]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 8 @@ -311,9 +311,9 @@ define dso_local void @Correlation(ptr nocapture readonly %Input, ptr nocapture ; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP11]]) ; CHECK-NEXT: [[TMP13]] = sub i32 [[TMP11]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[LSR_IV]], <4 x i1> [[TMP12]], <4 x i16> undef) ; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> -; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[LSR_IV48]], i32 2, <4 x i1> [[TMP12]], <4 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD42:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[LSR_IV48]], <4 x i1> [[TMP12]], <4 x i16> undef) ; CHECK-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD42]] to <4 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[CONV1032]], i32 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll index b54d526..88ae227 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -23,14 +23,14 @@ define void @expand_v8i16_v8i32(ptr noalias nocapture readonly %a, ptr noalias n ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP]], <8 x i1> [[TMP1]], <8 x i16> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP3]], <8 x i1> [[TMP1]], <8 x i16> undef) ; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <8 x i16> [[WIDE_MASKED_LOAD2]] to <8 x i32> ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <8 x i32> [[EXPAND_2]], [[EXPAND_1]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds i32, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[MUL]], ptr [[TMP6]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[MUL]], ptr align 4 [[TMP6]], <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -100,9 +100,9 @@ define void @expand_v8i16_v4i32(ptr readonly %a, ptr readonly %b, ptr %c, ptr %d ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i16, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP]], <8 x i1> [[TMP1]], <8 x i16> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP1]], <8 x i16> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 4 [[TMP3]], <8 x i1> [[TMP1]], <8 x i16> undef) ; CHECK-NEXT: [[EXTRACT_2_LOW:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[EXTRACT_2_HIGH:%.*]] = shufflevector <8 x i16> [[WIDE_MASKED_LOAD2]], <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i16> [[EXTRACT_2_LOW]] to <4 x i32> @@ -114,9 +114,9 @@ define void @expand_v8i16_v4i32(ptr readonly %a, ptr readonly %b, ptr %c, ptr %d ; CHECK-NEXT: [[INDUCTION_STORE:%.*]] = add <4 x i32> [[BROADCAST_SPLAT_STORE]], <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[STORE_PRED:%.*]] = icmp ule <4 x i32> [[INDUCTION_STORE]], [[BROADCAST_SPLAT11_STORE]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i32 [[STORE_IDX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[STORE_PRED]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[MUL]], ptr align 4 [[TMP6]], <4 x i1> [[STORE_PRED]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[D]], i32 [[STORE_IDX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[SUB]], ptr [[GEP]], i32 4, <4 x i1> [[STORE_PRED]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[SUB]], ptr align 4 [[GEP]], <4 x i1> [[STORE_PRED]]) ; CHECK-NEXT: [[STORE_IDX_NEXT]] = add i32 [[STORE_IDX]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) @@ -197,14 +197,14 @@ define void @expand_v4i32_v4i64(ptr noalias nocapture readonly %a, ptr noalias n ; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP]], <4 x i1> 
[[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP3]], <4 x i1> [[TMP1]], <4 x i32> undef) ; CHECK-NEXT: [[EXPAND_1:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD]] to <4 x i64> ; CHECK-NEXT: [[EXPAND_2:%.*]] = zext <4 x i32> [[WIDE_MASKED_LOAD2]] to <4 x i64> ; CHECK-NEXT: [[MUL:%.*]] = mul nsw <4 x i64> [[EXPAND_2]], [[EXPAND_1]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[C]], i32 [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[MUL]], ptr [[TMP6]], i32 4, <4 x i1> [[TMP1]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[MUL]], ptr align 4 [[TMP6]], <4 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP15]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP14]], i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index fb1a4a4..3ae75d2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -25,8 +25,8 @@ define i32 @vec_mul_reduce_add(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[N]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP7]]) ; CHECK-NEXT: [[TMP9]] = sub i32 [[TMP7]], 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV2]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef) -; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP8]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV2]], <4 x i1> [[TMP8]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[LSR_IV]], <4 x i1> [[TMP8]], <4 x i32> undef) ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP11]] = add nsw <4 x i32> [[TMP10]], [[VEC_PHI]] ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4 diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll index 314e1b4..91cd3dd 100644 --- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll +++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll @@ -14,13 +14,11 @@ define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture n ; CHECK: i32x4.add ; MAX-BANDWIDTH: v128.load -; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s ; MAX-BANDWIDTH: v128.load -; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s -; MAX-BANDWIDTH: i32x4.dot_i16x8_s -; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s -; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s -; MAX-BANDWIDTH: i32x4.dot_i16x8_s +; MAX-BANDWIDTH: i16x8.extmul_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i16x8.extmul_high_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: i32x4.add 
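For reference, a minimal sketch (not part of the patch) of the masked-load signature change that most of the hunks above and below update to, shown on a hypothetical pointer %p and mask %m; the same pattern applies to llvm.masked.store, gather and scatter:

; old form: alignment passed as an explicit i32 operand
%v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %m, <4 x i32> undef)
; new form: alignment carried as an align parameter attribute on the pointer operand
%v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 %p, <4 x i1> %m, <4 x i32> undef)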
diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir index 470a30fd..bd4e9a4 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-leading-zeros.mir @@ -37,9 +37,9 @@ body: | ; X86-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; X86-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[CTLZ]], [[C1]] ; X86-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) + ; X86-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ADD]](s64) ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CTLZ_ZERO_UNDEF]](s64) - ; X86-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT1]](s32), [[UV2]], [[UV4]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT1]](s32), [[UV3]], [[UV5]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) @@ -111,9 +111,9 @@ body: | ; X86-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 ; X86-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[CTLZ]], [[C1]] ; X86-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s64) = G_CTLZ_ZERO_UNDEF [[UV1]](s32) + ; X86-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ADD]](s64) ; X86-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[CTLZ_ZERO_UNDEF]](s64) - ; X86-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT]](s32), [[UV2]], [[UV4]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT]](s32), [[UV3]], [[UV5]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-select.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-select.mir index a7cbb35..6ab424e 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-select.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-select.mir @@ -33,9 +33,9 @@ body: | ; X86: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF ; X86-NEXT: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF ; X86-NEXT: [[DEF2:%[0-9]+]]:_(s1) = IMPLICIT_DEF + ; X86-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[DEF2]](s1) ; X86-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s64) ; X86-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64) - ; X86-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[DEF2]](s1) ; X86-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT]](s32), [[UV]], [[UV2]] ; X86-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ZEXT]](s32), [[UV1]], [[UV3]] ; X86-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[SELECT]](s32), [[SELECT1]](s32) @@ -115,9 +115,9 @@ body: | ; X64: [[DEF:%[0-9]+]]:_(s8) = IMPLICIT_DEF ; X64-NEXT: [[DEF1:%[0-9]+]]:_(s8) = IMPLICIT_DEF ; X64-NEXT: [[DEF2:%[0-9]+]]:_(s1) = IMPLICIT_DEF + ; X64-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[DEF2]](s1) ; X64-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[DEF1]](s8) ; X64-NEXT: [[ANYEXT1:%[0-9]+]]:_(s16) = G_ANYEXT [[DEF]](s8) - ; X64-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[DEF2]](s1) ; X64-NEXT: [[SELECT:%[0-9]+]]:_(s16) = G_SELECT [[ZEXT]](s32), [[ANYEXT]], [[ANYEXT1]] ; X64-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[SELECT]](s16) ; X64-NEXT: [[COPY:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8) diff --git a/llvm/test/CodeGen/X86/fcmove.ll 
b/llvm/test/CodeGen/X86/fcmove.ll deleted file mode 100644 index 6bb0148..0000000 --- a/llvm/test/CodeGen/X86/fcmove.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc %s -o - -verify-machineinstrs | FileCheck %s - -target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-unknown" - -; Test that we can generate an fcmove, and also that it passes verification. - -; CHECK-LABEL: cmove_f -; CHECK: fcmove %st({{[0-7]}}), %st -define x86_fp80 @cmove_f(x86_fp80 %a, x86_fp80 %b, i32 %c) { - %test = icmp eq i32 %c, 0 - %add = fadd x86_fp80 %a, %b - %ret = select i1 %test, x86_fp80 %add, x86_fp80 %b - ret x86_fp80 %ret -} diff --git a/llvm/test/CodeGen/X86/isel-select-fcmov.ll b/llvm/test/CodeGen/X86/isel-select-fcmov.ll new file mode 100644 index 0000000..cb441b8 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-select-fcmov.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=+cmov -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=+cmov -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=X86-GISEL +; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=+cmov -fast-isel=0 -global-isel=0 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=X64-GISEL +; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel=0 -global-isel=0 | FileCheck %s --check-prefix=X64 + +; Test that we can generate an fcmove, and also that it passes verification. + +define x86_fp80 @cmove_cmp(x86_fp80 %a, x86_fp80 %b, i32 %c) { +; X86-LABEL: cmove_cmp: +; X86: # %bb.0: +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: fadd %st(1), %st +; X86-NEXT: fxch %st(1) +; X86-NEXT: fcmove %st(1), %st +; X86-NEXT: fstp %st(1) +; X86-NEXT: retl +; +; X86-GISEL-LABEL: cmove_cmp: +; X86-GISEL: # %bb.0: +; X86-GISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: xorl %eax, %eax +; X86-GISEL-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: fadd %st, %st(1) +; X86-GISEL-NEXT: andl $1, %eax +; X86-GISEL-NEXT: testl %eax, %eax +; X86-GISEL-NEXT: fxch %st(1) +; X86-GISEL-NEXT: fcmove %st(1), %st +; X86-GISEL-NEXT: fstp %st(1) +; X86-GISEL-NEXT: retl +; +; X64-LABEL: cmove_cmp: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: testl %edi, %edi +; X64-NEXT: fadd %st(1), %st +; X64-NEXT: fxch %st(1) +; X64-NEXT: fcmove %st(1), %st +; X64-NEXT: fstp %st(1) +; X64-NEXT: retq +; +; X64-GISEL-LABEL: cmove_cmp: +; X64-GISEL: # %bb.0: +; X64-GISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-GISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-GISEL-NEXT: xorl %eax, %eax +; X64-GISEL-NEXT: cmpl $0, %edi +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: fadd %st, %st(1) +; X64-GISEL-NEXT: andl $1, %eax +; X64-GISEL-NEXT: testl %eax, %eax +; X64-GISEL-NEXT: fxch %st(1) +; X64-GISEL-NEXT: fcmove %st(1), %st +; X64-GISEL-NEXT: fstp %st(1) +; X64-GISEL-NEXT: retq + %test = icmp eq i32 %c, 0 + %add = fadd x86_fp80 %a, %b + %ret = select i1 %test, x86_fp80 %add, x86_fp80 %b + ret x86_fp80 %ret +} + +define x86_fp80 @cmove_arg(x86_fp80 %a, x86_fp80 %b, i1 %test) { +; X86-LABEL: cmove_arg: +; X86: # 
%bb.0: +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fadd %st(1), %st +; X86-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-NEXT: fxch %st(1) +; X86-NEXT: fcmovne %st(1), %st +; X86-NEXT: fstp %st(1) +; X86-NEXT: retl +; +; X86-GISEL-LABEL: cmove_arg: +; X86-GISEL: # %bb.0: +; X86-GISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: fadd %st, %st(1) +; X86-GISEL-NEXT: movl $1, %eax +; X86-GISEL-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: testl %eax, %eax +; X86-GISEL-NEXT: fxch %st(1) +; X86-GISEL-NEXT: fcmove %st(1), %st +; X86-GISEL-NEXT: fstp %st(1) +; X86-GISEL-NEXT: retl +; +; X64-LABEL: cmove_arg: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fadd %st(1), %st +; X64-NEXT: testb $1, %dil +; X64-NEXT: fxch %st(1) +; X64-NEXT: fcmovne %st(1), %st +; X64-NEXT: fstp %st(1) +; X64-NEXT: retq +; +; X64-GISEL-LABEL: cmove_arg: +; X64-GISEL: # %bb.0: +; X64-GISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-GISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-GISEL-NEXT: fadd %st, %st(1) +; X64-GISEL-NEXT: andl $1, %edi +; X64-GISEL-NEXT: testl %edi, %edi +; X64-GISEL-NEXT: fxch %st(1) +; X64-GISEL-NEXT: fcmove %st(1), %st +; X64-GISEL-NEXT: fstp %st(1) +; X64-GISEL-NEXT: retq + %add = fadd x86_fp80 %a, %b + %ret = select i1 %test, x86_fp80 %add, x86_fp80 %b + ret x86_fp80 %ret +} + +define x86_fp80 @cmove_load(x86_fp80 %a, x86_fp80 %b, ptr %p) { +; X86-LABEL: cmove_load: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fadd %st(1), %st +; X86-NEXT: cmpb $0, (%eax) +; X86-NEXT: fxch %st(1) +; X86-NEXT: fcmovne %st(1), %st +; X86-NEXT: fstp %st(1) +; X86-NEXT: retl +; +; X86-GISEL-LABEL: cmove_load: +; X86-GISEL: # %bb.0: +; X86-GISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: fldt {{[0-9]+}}(%esp) +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: fadd %st, %st(1) +; X86-GISEL-NEXT: movzbl (%eax), %eax +; X86-GISEL-NEXT: andl $1, %eax +; X86-GISEL-NEXT: testl %eax, %eax +; X86-GISEL-NEXT: fxch %st(1) +; X86-GISEL-NEXT: fcmove %st(1), %st +; X86-GISEL-NEXT: fstp %st(1) +; X86-GISEL-NEXT: retl +; +; X64-LABEL: cmove_load: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fadd %st(1), %st +; X64-NEXT: cmpb $0, (%rdi) +; X64-NEXT: fxch %st(1) +; X64-NEXT: fcmovne %st(1), %st +; X64-NEXT: fstp %st(1) +; X64-NEXT: retq +; +; X64-GISEL-LABEL: cmove_load: +; X64-GISEL: # %bb.0: +; X64-GISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-GISEL-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-GISEL-NEXT: fadd %st, %st(1) +; X64-GISEL-NEXT: movzbl (%rdi), %eax +; X64-GISEL-NEXT: andl $1, %eax +; X64-GISEL-NEXT: testl %eax, %eax +; X64-GISEL-NEXT: fxch %st(1) +; X64-GISEL-NEXT: fcmove %st(1), %st +; X64-GISEL-NEXT: fstp %st(1) +; X64-GISEL-NEXT: retq + %test = load i1, ptr %p + %add = fadd x86_fp80 %a, %b + %ret = select i1 %test, x86_fp80 %add, x86_fp80 %b + ret x86_fp80 %ret +} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 3279a50..7a08f3e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -624,6 +624,52 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ret void } +define i32 @PR164107(<16 x i1> %0) { +; AVX1-LABEL: PR164107: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw 
$15, %xmm0, %xmm0 +; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: PR164107: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: PR164107: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 +; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} + %cmp = shufflevector <16 x i1> %0, <16 x i1> zeroinitializer, <16 x i32> zeroinitializer + %sext = sext <16 x i1> %cmp to <16 x i64> + %bc.1 = bitcast <16 x i64> %sext to <64 x i16> + %vecinit15.i = shufflevector <64 x i16> %bc.1, <64 x i16> zeroinitializer, <16 x i32> <i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> + %conv16.i = sext <16 x i16> %vecinit15.i to <16 x i64> + %bc.2 = bitcast <16 x i64> %conv16.i to <32 x i32> + %conv22.i = extractelement <32 x i32> %bc.2, i64 4 + ret i32 %conv22.i +} + define <4 x i64> @concat_self_v4i64(<2 x i64> %x) { ; AVX1-LABEL: concat_self_v4i64: ; AVX1: # %bb.0: diff --git a/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll b/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll index c8cc871..f5dcf01 100644 --- a/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll +++ b/llvm/test/DebugInfo/Generic/compileunit-source-language-name.ll @@ -1,3 +1,5 @@ +; AIX doesn't have support for DWARF 6 DW_AT_language_name +; XFAIL: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} ; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-info - | FileCheck %s --implicit-check-not "DW_AT_language" ; CHECK: DW_AT_language_name (DW_LNAME_ObjC_plus_plus) diff --git a/llvm/test/DebugInfo/Generic/compileunit-source-language.ll b/llvm/test/DebugInfo/Generic/compileunit-source-language.ll index bafe620..dc5bf0f 100644 --- a/llvm/test/DebugInfo/Generic/compileunit-source-language.ll +++ b/llvm/test/DebugInfo/Generic/compileunit-source-language.ll @@ -1,3 +1,5 @@ +; AIX doesn't have support for DWARF 6 DW_AT_language_name +; XFAIL: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} ; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-info - | FileCheck %s --implicit-check-not "DW_AT_language_name" ; CHECK: DW_AT_language (DW_LANG_C) diff --git a/llvm/test/DebugInfo/dwarf-complex-int.ll b/llvm/test/DebugInfo/dwarf-complex-int.ll index effd0ec..84be51e 100644 --- a/llvm/test/DebugInfo/dwarf-complex-int.ll +++ b/llvm/test/DebugInfo/dwarf-complex-int.ll @@ -1,4 +1,6 @@ ; REQUIRES: object-emission +; AIX doesn't have full support for DWARF 5 +; XFAIL: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} ; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s ;; https://github.com/llvm/llvm-project/issues/140362 diff --git 
a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s index 83d71cd..5293958 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/LocalDependencyPropagation.s @@ -16,8 +16,7 @@ # CHECK-DAG: Symbols: { _foo }, Dependencies: { (main, { _external_func }) } # CHECK-DAG: Symbols: { _baz }, Dependencies: { (main, { _foo }) } # CHECK: Simplified dependencies: -# CHECK-DAG: Symbols: { _foo }, Dependencies: { (main, { _external_func }) } -# CHECK-DAG: Symbols: { _baz }, Dependencies: { (main, { _external_func }) } +# CHECK-DAG: Defs: { (main, [ _baz _foo ]) }, Deps: { (main, [ _external_func ]) } .section __TEXT,__text,regular,pure_instructions diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll index afa3d09..60a4f22 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll @@ -22,11 +22,11 @@ define void @store.v4f32.1110(ptr %p, <4 x float> %arg) sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr <4 x float>, ptr [[P]], i64 0, i64 2 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 ; CHECK-NEXT: call void @__asan_store4(i64 [[TMP6]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr [[P]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>) +; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr align 4 [[P]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @store.v4f32.1110( -; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr [[P:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>) +; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr align 4 [[P:%.*]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>) @@ -47,11 +47,11 @@ define void @store.v8i32.10010110(ptr %p, <8 x i32> %arg) sanitize_address { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <8 x i32>, ptr [[P]], i64 0, i64 6 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 ; CHECK-NEXT: call void @__asan_store4(i64 [[TMP8]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[ARG:%.*]], ptr [[P]], i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) +; CHECK-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[ARG:%.*]], ptr align 8 [[P]], <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @store.v8i32.10010110( -; DISABLED-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[ARG:%.*]], ptr [[P:%.*]], i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) +; DISABLED-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> [[ARG:%.*]], ptr align 8 [[P:%.*]], <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 
false>) @@ -63,11 +63,11 @@ define void @store.v4i64.0001(ptr %p, <4 x ptr> %arg) sanitize_address { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <4 x ptr>, ptr [[P:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: call void @__asan_store8(i64 [[TMP2]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> [[ARG:%.*]], ptr [[P]], i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) +; CHECK-NEXT: tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> [[ARG:%.*]], ptr align 8 [[P]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @store.v4i64.0001( -; DISABLED-NEXT: tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> [[ARG:%.*]], ptr [[P:%.*]], i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) +; DISABLED-NEXT: tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> [[ARG:%.*]], ptr align 8 [[P:%.*]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> %arg, ptr %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) @@ -108,11 +108,11 @@ define void @store.v4f32.variable(ptr %p, <4 x float> %arg, <4 x i1> %mask) sani ; CHECK-NEXT: call void @__asan_store4(i64 [[TMP19]]) ; CHECK-NEXT: br label [[TMP20]] ; CHECK: 20: -; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr [[P]], i32 4, <4 x i1> [[MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr align 4 [[P]], <4 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @store.v4f32.variable( -; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr [[P:%.*]], i32 4, <4 x i1> [[MASK:%.*]]) +; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr align 4 [[P:%.*]], <4 x i1> [[MASK:%.*]]) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> %mask) @@ -125,11 +125,11 @@ define void @store.v4f32.1010.split(ptr %p, <4 x float> %arg) sanitize_address { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <4 x float>, ptr [[P:%.*]], i64 0, i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: call void @__asan_store4(i64 [[TMP2]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr [[P]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) +; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 true, i1 false>) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @store.v4f32.1010.split( -; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr [[P:%.*]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) +; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG:%.*]], ptr align 4 [[P:%.*]], <4 x i1> <i1 false, i1 false, i1 true, i1 false>) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) @@ -142,12 +142,12 @@ define void @store.v4f32.0010.after.full.store(ptr %p, <4 x float> %arg) sanitiz ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; CHECK-NEXT: call void @__asan_store16(i64 [[TMP1]]) ; CHECK-NEXT: store <4 x float> [[ARG:%.*]], ptr [[P]], align 16 -; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG]], ptr [[P]], i32 4, <4 x i1> <i1 false, i1 
false, i1 true, i1 false>) +; CHECK-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG]], ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 true, i1 false>) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @store.v4f32.0010.after.full.store( ; DISABLED-NEXT: store <4 x float> [[ARG:%.*]], ptr [[P:%.*]], align 16 -; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG]], ptr [[P]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) +; DISABLED-NEXT: tail call void @llvm.masked.store.v4f32.p0(<4 x float> [[ARG]], ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 true, i1 false>) ; DISABLED-NEXT: ret void ; store <4 x float> %arg, ptr %p @@ -174,11 +174,11 @@ define <8 x i32> @load.v8i32.11100001(ptr %p, <8 x i32> %arg) sanitize_address { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <8 x i32>, ptr [[P]], i64 0, i64 7 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 ; CHECK-NEXT: call void @__asan_load4(i64 [[TMP8]]) -; CHECK-NEXT: [[RES:%.*]] = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[P]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> [[ARG:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 8 [[P]], <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> [[ARG:%.*]]) ; CHECK-NEXT: ret <8 x i32> [[RES]] ; ; DISABLED-LABEL: @load.v8i32.11100001( -; DISABLED-NEXT: [[RES:%.*]] = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[P:%.*]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> [[ARG:%.*]]) +; DISABLED-NEXT: [[RES:%.*]] = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 8 [[P:%.*]], <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> [[ARG:%.*]]) ; DISABLED-NEXT: ret <8 x i32> [[RES]] ; %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg) @@ -193,11 +193,11 @@ define <4 x float> @load.v4f32.1001(ptr %p, <4 x float> %arg) sanitize_address { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr <4 x float>, ptr [[P]], i64 0, i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64 ; CHECK-NEXT: call void @__asan_load4(i64 [[TMP4]]) -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) ; CHECK-NEXT: ret <4 x float> [[RES]] ; ; DISABLED-LABEL: @load.v4f32.1001( -; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P:%.*]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) +; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) ; DISABLED-NEXT: ret <4 x float> [[RES]] ; %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg) @@ -209,11 +209,11 @@ define <4 x ptr> @load.v4i64.0001(ptr %p, <4 x ptr> %arg) sanitize_address { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <4 x ptr>, ptr [[P:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP2:%.*]] 
= ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: call void @__asan_load8(i64 [[TMP2]]) -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[P]], i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> [[ARG:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[P]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> [[ARG:%.*]]) ; CHECK-NEXT: ret <4 x ptr> [[RES]] ; ; DISABLED-LABEL: @load.v4i64.0001( -; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[P:%.*]], i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> [[ARG:%.*]]) +; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[P:%.*]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> [[ARG:%.*]]) ; DISABLED-NEXT: ret <4 x ptr> [[RES]] ; %res = tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> %arg) @@ -254,11 +254,11 @@ define <4 x float> @load.v4f32.variable(ptr %p, <4 x float> %arg, <4 x i1> %mask ; CHECK-NEXT: call void @__asan_load4(i64 [[TMP19]]) ; CHECK-NEXT: br label [[TMP20]] ; CHECK: 20: -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> [[MASK]], <4 x float> [[ARG:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> [[MASK]], <4 x float> [[ARG:%.*]]) ; CHECK-NEXT: ret <4 x float> [[RES]] ; ; DISABLED-LABEL: @load.v4f32.variable( -; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x float> [[ARG:%.*]]) +; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P:%.*]], <4 x i1> [[MASK:%.*]], <4 x float> [[ARG:%.*]]) ; DISABLED-NEXT: ret <4 x float> [[RES]] ; %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x float> %arg) @@ -271,16 +271,16 @@ define <4 x float> @load.v4f32.1001.split(ptr %p, <4 x float> %arg) sanitize_add ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <4 x float>, ptr [[P:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: call void @__asan_load4(i64 [[TMP2]]) -; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> [[ARG:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> [[ARG:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr <4 x float>, ptr [[P]], i64 0, i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64 ; CHECK-NEXT: call void @__asan_load4(i64 [[TMP4]]) -; CHECK-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[RES]]) +; CHECK-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[RES]]) ; CHECK-NEXT: ret <4 x float> [[RES2]] ; ; DISABLED-LABEL: @load.v4f32.1001.split( -; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P:%.*]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> [[ARG:%.*]]) -; DISABLED-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, 
<4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[RES]]) +; DISABLED-NEXT: [[RES:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> [[ARG:%.*]]) +; DISABLED-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[RES]]) ; DISABLED-NEXT: ret <4 x float> [[RES2]] ; %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg) @@ -294,12 +294,12 @@ define <4 x float> @load.v4f32.1001.after.full.load(ptr %p, <4 x float> %arg) sa ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; CHECK-NEXT: call void @__asan_load16(i64 [[TMP1]]) ; CHECK-NEXT: [[RES:%.*]] = load <4 x float>, ptr [[P]], align 16 -; CHECK-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) +; CHECK-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) ; CHECK-NEXT: ret <4 x float> [[RES2]] ; ; DISABLED-LABEL: @load.v4f32.1001.after.full.load( ; DISABLED-NEXT: [[RES:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; DISABLED-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) +; DISABLED-NEXT: [[RES2:%.*]] = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> [[ARG:%.*]]) ; DISABLED-NEXT: ret <4 x float> [[RES2]] ; %res = load <4 x float>, ptr %p @@ -331,11 +331,11 @@ define <vscale x 4 x float> @scalable.load.nxv4f32(ptr %p, <vscale x 4 x i1> %ma ; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP2]] ; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] ; CHECK: .split.split: -; CHECK-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[P]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> undef) +; CHECK-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[P]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> undef) ; CHECK-NEXT: ret <vscale x 4 x float> [[RES]] ; ; DISABLED-LABEL: @scalable.load.nxv4f32( -; DISABLED-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[P:%.*]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> undef) +; DISABLED-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> undef) ; DISABLED-NEXT: ret <vscale x 4 x float> [[RES]] ; %res = tail call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %p, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef) @@ -361,11 +361,11 @@ define void @scalable.store.nxv4f32(ptr %p, <vscale x 4 x float> %arg, <vscale x ; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP2]] ; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] ; CHECK: .split.split: -; CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[ARG:%.*]], ptr [[P]], i32 4, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> 
[[ARG:%.*]], ptr align 4 [[P]], <vscale x 4 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @scalable.store.nxv4f32( -; DISABLED-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[ARG:%.*]], ptr [[P:%.*]], i32 4, <vscale x 4 x i1> [[MASK:%.*]]) +; DISABLED-NEXT: tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[ARG:%.*]], ptr align 4 [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]]) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %arg, ptr %p, i32 4, <vscale x 4 x i1> %mask) @@ -395,11 +395,11 @@ define <vscale x 4 x float> @scalable.gather.nxv4f32(<vscale x 4 x ptr> %vp, <vs ; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP2]] ; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] ; CHECK: .split.split: -; CHECK-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[VP]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> undef) +; CHECK-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[VP]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> undef) ; CHECK-NEXT: ret <vscale x 4 x float> [[RES]] ; ; DISABLED-LABEL: @scalable.gather.nxv4f32( -; DISABLED-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[VP:%.*]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> undef) +; DISABLED-NEXT: [[RES:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[VP:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> undef) ; DISABLED-NEXT: ret <vscale x 4 x float> [[RES]] ; %res = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> %vp, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef) @@ -425,11 +425,11 @@ define void @scalable.scatter.nxv4f32(<vscale x 4 x float> %val, <vscale x 4 x p ; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP2]] ; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] ; CHECK: .split.split: -; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> [[VP]], i32 4, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> align 4 [[VP]], <vscale x 4 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; DISABLED-LABEL: @scalable.scatter.nxv4f32( -; DISABLED-NEXT: tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> [[VP:%.*]], i32 4, <vscale x 4 x i1> [[MASK:%.*]]) +; DISABLED-NEXT: tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> align 4 [[VP:%.*]], <vscale x 4 x i1> [[MASK:%.*]]) ; DISABLED-NEXT: ret void ; tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> %val, <vscale x 4 x ptr> %vp, i32 4, <vscale x 4 x i1> %mask) diff --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll index 80d6e0f..76174719 100644 --- a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll +++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll @@ -28,7 +28,7 @@ define void @store.v4f32.1110(<4 x float> %arg) { ; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, ptr %p, i64 0, i64 2 ; STORE: 
[[PGEP2:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP2]] to i64 ; STORE: call void @__memprof_store(i64 [[PGEP2]]) -; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>) +; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr align 4 %p, <4 x i1> <i1 true, i1 true, i1 true, i1 false>) tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>) ret void } @@ -49,7 +49,7 @@ define void @store.v8i32.10010110(<8 x i32> %arg) { ; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, ptr %p, i64 0, i64 6 ; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP6]] to i64 ; STORE: call void @__memprof_store(i64 [[PGEP6]]) -; STORE: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) +; STORE: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr align 8 %p, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) tail call void @llvm.masked.store.v8i32.p0(<8 x i32> %arg, ptr %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>) ret void } @@ -61,7 +61,7 @@ define void @store.v4i64.0001(<4 x ptr> %arg) { ; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x ptr>, ptr %p, i64 0, i64 3 ; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP3]] to i64 ; STORE: call void @__memprof_store(i64 [[PGEP3]]) -; STORE: tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> %arg, ptr %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) +; STORE: tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> %arg, ptr align 8 %p, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) tail call void @llvm.masked.store.v4p0.p0(<4 x ptr> %arg, ptr %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) ret void } @@ -105,7 +105,7 @@ define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; STORE: br label %[[AFTER3]] ; STORE: [[AFTER3]]: -; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> %mask) +; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr align 4 %p, <4 x i1> %mask) tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> %mask) ret void } @@ -117,12 +117,12 @@ define void @store.v4f32.1010.split(<4 x float> %arg) { ; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, ptr %p, i64 0, i64 0 ; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP0]] to i64 ; STORE: call void @__memprof_store(i64 [[PGEP0]]) -; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>) +; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr align 4 %p, <4 x i1> <i1 true, i1 false, i1 false, i1 false>) tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>) ; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, ptr %p, i64 0, i64 2 ; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP1]] to i64 ; STORE: call void @__memprof_store(i64 [[PGEP1]]) -; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) +; STORE: tail call void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr align 4 %p, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) tail call 
void @llvm.masked.store.v4f32.p0(<4 x float> %arg, ptr %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>) ret void } @@ -148,7 +148,7 @@ define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) { ; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, ptr %p, i64 0, i64 7 ; LOAD: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP7]] to i64 ; LOAD: call void @__memprof_load(i64 [[PGEP7]]) -; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg) +; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 8 %p, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg) %res = tail call <8 x i32> @llvm.masked.load.v8i32.p0(ptr %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg) ret <8 x i32> %res } @@ -163,7 +163,7 @@ define <4 x float> @load.v4f32.1001(<4 x float> %arg) { ; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, ptr %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP3]] to i64 ; LOAD: call void @__memprof_load(i64 [[PGEP3]]) -; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg) +; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 %p, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg) %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg) ret <4 x float> %res } @@ -175,7 +175,7 @@ define <4 x ptr> @load.v4i64.0001(<4 x ptr> %arg) { ; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x ptr>, ptr %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP3]] to i64 ; LOAD: call void @__memprof_load(i64 [[PGEP3]]) -; LOAD: tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> %arg) +; LOAD: tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 %p, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> %arg) %res = tail call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x ptr> %arg) ret <4 x ptr> %res } @@ -219,7 +219,7 @@ define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) { ; LOAD: br label %[[AFTER3]] ; LOAD: [[AFTER3]]: -; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x float> %arg) +; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 %p, <4 x i1> %mask, <4 x float> %arg) %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x float> %arg) ret <4 x float> %res } @@ -231,12 +231,12 @@ define <4 x float> @load.v4f32.1001.split(<4 x float> %arg) { ; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, ptr %p, i64 0, i64 0 ; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP0]] to i64 ; LOAD: call void @__memprof_load(i64 [[PGEP0]]) -; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg) +; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 %p, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg) %res = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg) ; LOAD: 
[[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, ptr %p, i64 0, i64 3 ; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint ptr [[GEP3]] to i64 ; LOAD: call void @__memprof_load(i64 [[PGEP3]]) -; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res) +; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 %p, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res) %res2 = tail call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res) ret <4 x float> %res2 } diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll new file mode 100644 index 0000000..1ddcd4b --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount-mini.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s + +; XFAIL: * + +; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll +; Manually minimized to show MSan leads to a compiler crash + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind { + ret target("aarch64.svcount") %arg1 +} diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll new file mode 100644 index 0000000..9caa89d --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/sme-aarch64-svcount.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=msan -mattr=+sme -o - %s | FileCheck %s + +; XFAIL: * + +; Forked from llvm/test/CodeGen/AArch64/sme-aarch64-svcount.ll + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +; +; Test simple loads, stores and return. 
+; +define target("aarch64.svcount") @test_load(ptr %ptr) nounwind { + %res = load target("aarch64.svcount"), ptr %ptr + ret target("aarch64.svcount") %res +} + +define void @test_store(ptr %ptr, target("aarch64.svcount") %val) nounwind { + store target("aarch64.svcount") %val, ptr %ptr + ret void +} + +define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val) nounwind { + %ptr = alloca target("aarch64.svcount"), align 1 + store target("aarch64.svcount") %val, ptr %ptr + %res = load target("aarch64.svcount"), ptr %ptr + ret target("aarch64.svcount") %res +} + +; +; Test passing as arguments (from perspective of callee) +; + +define target("aarch64.svcount") @test_return_arg1(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1) nounwind { + ret target("aarch64.svcount") %arg1 +} + +define target("aarch64.svcount") @test_return_arg4(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4) nounwind { + ret target("aarch64.svcount") %arg4 +} + +; +; Test passing as arguments (from perspective of caller) +; + +declare void @take_svcount_1(target("aarch64.svcount") %arg) +define void @test_pass_1arg(target("aarch64.svcount") %arg) nounwind { + call void @take_svcount_1(target("aarch64.svcount") %arg) + ret void +} + +declare void @take_svcount_5(target("aarch64.svcount") %arg0, target("aarch64.svcount") %arg1, target("aarch64.svcount") %arg2, target("aarch64.svcount") %arg3, target("aarch64.svcount") %arg4) +define void @test_pass_5args(target("aarch64.svcount") %arg) nounwind { + call void @take_svcount_5(target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg, target("aarch64.svcount") %arg) + ret void +} + +define target("aarch64.svcount") @test_sel(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i1 %cmp) sanitize_memory { + %x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y + ret target("aarch64.svcount") %x.y +} + +define target("aarch64.svcount") @test_sel_cc(target("aarch64.svcount") %x, target("aarch64.svcount") %y, i32 %k) sanitize_memory { + %cmp = icmp sgt i32 %k, 42 + %x.y = select i1 %cmp, target("aarch64.svcount") %x, target("aarch64.svcount") %y + ret target("aarch64.svcount") %x.y +} diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index 25a4a9a..f0a1791 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -732,7 +732,7 @@ define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr align 1 [[TMP9]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 @@ -742,7 +742,7 @@ define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 ; 
CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr align 1 [[PTR]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -776,7 +776,7 @@ define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr align 1 [[TMP9]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 @@ -786,7 +786,7 @@ define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr align 1 [[PTR]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -820,7 +820,7 @@ define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr align 64 [[TMP9]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 @@ -830,7 +830,7 @@ define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr align 64 [[PTR]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -864,7 +864,7 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> 
[[TMP2]], ptr align 64 [[TMP9]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 @@ -874,7 +874,7 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr align 64 [[PTR]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -908,7 +908,7 @@ define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr align 1 [[TMP9]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 @@ -918,7 +918,7 @@ define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr align 1 [[PTR1]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -952,7 +952,7 @@ define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i3 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr align 1 [[TMP9]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 @@ -962,7 +962,7 @@ define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i3 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr align 1 [[PTR1]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -996,7 +996,7 @@ define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to 
i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr align 64 [[TMP9]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 @@ -1006,7 +1006,7 @@ define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr align 64 [[PTR1]], <8 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -1040,7 +1040,7 @@ define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr align 64 [[TMP9]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 @@ -1050,7 +1050,7 @@ define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr align 64 [[PTR1]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -1093,7 +1093,7 @@ define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[TMP13]], <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0 @@ -1103,13 +1103,13 @@ define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x float> [[TMP5]]) +; CHECK-NEXT: 
[[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP10]], <16 x float> [[TMP5]]) ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 ; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[TMP22]], <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0 @@ -1119,7 +1119,7 @@ define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 25: -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x float> zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP19]], <16 x float> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1156,7 +1156,7 @@ define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 1, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[TMP13]], <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0 @@ -1166,13 +1166,13 @@ define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP10]], <16 x float> [[TMP5]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 [[PTR]], <16 x i1> [[TMP10]], <16 x float> [[TMP5]]) ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 ; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 1, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[TMP22]], <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne 
i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0 @@ -1182,7 +1182,7 @@ define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 25: -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP19]], <16 x float> zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 1 [[PTR]], <16 x i1> [[TMP19]], <16 x float> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1219,7 +1219,7 @@ define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[TMP13]], <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0 @@ -1229,13 +1229,13 @@ define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP10]], <8 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 ; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[TMP22]], <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0 @@ -1245,7 +1245,7 @@ define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 25: -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x double> zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP19]], <8 x double> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]] ; CHECK-NEXT: store <8 x i64> 
[[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1282,7 +1282,7 @@ define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 1, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[TMP13]], <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0 @@ -1292,13 +1292,13 @@ define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP10]], <8 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 [[PTR]], <8 x i1> [[TMP10]], <8 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 ; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 1, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[TMP22]], <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0 @@ -1308,7 +1308,7 @@ define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 25: -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP19]], <8 x double> zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 1 [[PTR]], <8 x i1> [[TMP19]], <8 x double> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]] ; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1348,7 +1348,7 @@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[TMP14]], <16 x i1> [[TMP11]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP15]], 0 @@ -1358,13 +1358,13 
@@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR2]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[PTR2]], <16 x i1> [[TMP11]], <16 x i32> [[TMP6]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[MASK]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[TMP23]], <16 x i1> [[TMP20]], <16 x i32> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP19]] to i16 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP24]], 0 @@ -1374,7 +1374,7 @@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 26: -; CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 1 [[PTR]], <16 x i1> [[TMP20]], <16 x i32> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP27]], [[TMP18]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1412,7 +1412,7 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP14]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[TMP14]], <8 x i1> [[TMP11]], <8 x i64> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP10]] to i8 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP15]], 0 @@ -1422,13 +1422,13 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR2]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[PTR2]], <8 x i1> [[TMP11]], <8 x i64> [[TMP6]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[MASK]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> 
@llvm.masked.load.v8i64.p0(ptr [[TMP23]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[TMP23]], <8 x i1> [[TMP20]], <8 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP19]] to i8 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP24]], 0 @@ -1438,7 +1438,7 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 26: -; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 1 [[PTR]], <8 x i1> [[TMP20]], <8 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP27]], [[TMP18]] ; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1475,7 +1475,7 @@ define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mas ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[TMP13]], <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0 @@ -1485,13 +1485,13 @@ define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mas ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP10]], <16 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 ; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[TMP22]], <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0 @@ -1501,7 +1501,7 @@ define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mas ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 25: -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x 
i32> @llvm.masked.load.v16i32.p0(ptr align 64 [[PTR]], <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP26]], [[TMP17]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 @@ -1538,7 +1538,7 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[TMP13]], <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0 @@ -1548,13 +1548,13 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[TMP5]]) +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP10]], <8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 ; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[TMP22]], <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0 @@ -1564,7 +1564,7 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable ; CHECK: 25: -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr align 64 [[PTR]], <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] ; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP26]], [[TMP17]] ; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll index cc022e9..0362438 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -2031,7 +2031,7 @@ define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = xor i64 
[[TMP12]], 87960930222080 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP14]], i32 1, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr align 1 [[TMP14]], <4 x i1> [[EXTRACT]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP15]], 0 @@ -2041,7 +2041,7 @@ define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] ; CHECK-NEXT: unreachable ; CHECK: 17: -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr align 1 [[PTR]], <4 x i1> [[EXTRACT]]) ; CHECK-NEXT: ret void ; %1 = and i8 %mask, 1 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll index dbef575..fd4ad96 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll @@ -197,7 +197,7 @@ define void @test_int_x86_avx512_mask_storeu_b_512(ptr %ptr1, ptr %ptr2, <64 x i ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP2]], ptr [[TMP9]], i32 1, <64 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP2]], ptr align 1 [[TMP9]], <64 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <64 x i1> [[TMP5]] to i64 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP10]], 0 @@ -207,7 +207,7 @@ define void @test_int_x86_avx512_mask_storeu_b_512(ptr %ptr1, ptr %ptr2, <64 x i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[X1:%.*]], ptr [[PTR1]], i32 1, <64 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[X1:%.*]], ptr align 1 [[PTR1]], <64 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -240,7 +240,7 @@ define void @test_int_x86_avx512_mask_storeu_w_512(ptr %ptr1, ptr %ptr2, <32 x i ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v32i16.p0(<32 x i16> [[TMP2]], ptr [[TMP9]], i32 1, <32 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v32i16.p0(<32 x i16> [[TMP2]], ptr align 1 [[TMP9]], <32 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <32 x i1> [[TMP5]] to i32 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP10]], 0 @@ -250,7 +250,7 @@ define void @test_int_x86_avx512_mask_storeu_w_512(ptr %ptr1, ptr %ptr2, <32 x i ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; CHECK-NEXT: unreachable ; CHECK: 12: -; CHECK-NEXT: call 
void @llvm.masked.store.v32i16.p0(<32 x i16> [[X1:%.*]], ptr [[PTR1]], i32 1, <32 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v32i16.p0(<32 x i16> [[X1:%.*]], ptr align 1 [[PTR1]], <32 x i1> [[TMP6]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] ; CHECK: 13: @@ -293,7 +293,7 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_loadu_w_ ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr [[TMP14]], i32 1, <32 x i1> [[TMP11]], <32 x i16> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 [[TMP14]], <32 x i1> [[TMP11]], <32 x i16> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i1> [[TMP10]] to i32 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i32 [[TMP15]], 0 @@ -303,13 +303,13 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_loadu_w_ ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; CHECK-NEXT: unreachable ; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr [[PTR2]], i32 1, <32 x i1> [[TMP11]], <32 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP18:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 [[PTR2]], <32 x i1> [[TMP11]], <32 x i16> [[TMP6]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32 [[MASK]] to <32 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr [[TMP23]], i32 1, <32 x i1> [[TMP20]], <32 x i16> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 [[TMP23]], <32 x i1> [[TMP20]], <32 x i16> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <32 x i1> [[TMP19]] to i32 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i32 [[TMP24]], 0 @@ -319,7 +319,7 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_loadu_w_ ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; CHECK-NEXT: unreachable ; CHECK: 26: -; CHECK-NEXT: [[TMP27:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr [[PTR]], i32 1, <32 x i1> [[TMP20]], <32 x i16> zeroinitializer) +; CHECK-NEXT: [[TMP27:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 1 [[PTR]], <32 x i1> [[TMP20]], <32 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } { <32 x i16> splat (i16 -1), <32 x i16> splat (i16 -1), <32 x i16> splat (i16 -1) }, <32 x i16> [[_MSLD]], 0 ; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } poison, <32 x i16> [[TMP6]], 0 ; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <32 x i16>, <32 x i16>, <32 x i16> } [[TMP28]], <32 x i16> [[_MSMASKEDLD]], 1 @@ -362,7 +362,7 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_loadu_b_512 ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 
; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP14]], i32 1, <64 x i1> [[TMP11]], <64 x i8> [[_MSLD]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[TMP14]], <64 x i1> [[TMP11]], <64 x i8> [[_MSLD]]) ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i1> [[TMP10]] to i64 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP15]], 0 @@ -372,13 +372,13 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_loadu_b_512 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; CHECK-NEXT: unreachable ; CHECK: 17: -; CHECK-NEXT: [[TMP18:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[PTR2]], i32 1, <64 x i1> [[TMP11]], <64 x i8> [[TMP6]]) +; CHECK-NEXT: [[TMP18:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[PTR2]], <64 x i1> [[TMP11]], <64 x i8> [[TMP6]]) ; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP2]] to <64 x i1> ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[MASK]] to <64 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP23]], i32 1, <64 x i1> [[TMP20]], <64 x i8> zeroinitializer) +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[TMP23]], <64 x i1> [[TMP20]], <64 x i8> zeroinitializer) ; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = bitcast <64 x i1> [[TMP19]] to i64 ; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP24]], 0 @@ -388,7 +388,7 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_loadu_b_512 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; CHECK-NEXT: unreachable ; CHECK: 26: -; CHECK-NEXT: [[TMP27:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[PTR]], i32 1, <64 x i1> [[TMP20]], <64 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP27:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[PTR]], <64 x i1> [[TMP20]], <64 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSLD]], 0 ; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP6]], 0 ; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP28]], <64 x i8> [[_MSMASKEDLD]], 1 diff --git a/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll b/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll index 3ac6844..77b4836 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll @@ -24,8 +24,8 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP0]], ptr [[TMP3]], i32 1, <4 x i1> [[MASK:%.*]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[P]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x 
i64> [[TMP0]], ptr align 1 [[TMP3]], <4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[P]], <4 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @Store( @@ -37,7 +37,7 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory { ; ADDR-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; ADDR-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 ; ADDR-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; ADDR-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP0]], ptr [[TMP5]], i32 1, <4 x i1> [[MASK:%.*]]) +; ADDR-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP0]], ptr align 1 [[TMP5]], <4 x i1> [[MASK:%.*]]) ; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; ADDR-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[TMP2]] to i4 ; ADDR-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 @@ -47,7 +47,7 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory { ; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR7:[0-9]+]] ; ADDR-NEXT: unreachable ; ADDR: 8: -; ADDR-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[P]], i32 1, <4 x i1> [[MASK]]) +; ADDR-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[P]], <4 x i1> [[MASK]]) ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @Store( @@ -61,7 +61,7 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory { ; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 ; ORIGINS-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -4 ; ORIGINS-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; ORIGINS-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP0]], ptr [[TMP4]], i32 1, <4 x i1> [[MASK:%.*]]) +; ORIGINS-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP0]], ptr align 1 [[TMP4]], <4 x i1> [[MASK:%.*]]) ; ORIGINS-NEXT: store i32 [[TMP1]], ptr [[TMP7]], align 4 ; ORIGINS-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 1 ; ORIGINS-NEXT: store i32 [[TMP1]], ptr [[TMP8]], align 4 @@ -77,7 +77,7 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory { ; ORIGINS-NEXT: store i32 [[TMP1]], ptr [[TMP13]], align 4 ; ORIGINS-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP7]], i32 7 ; ORIGINS-NEXT: store i32 [[TMP1]], ptr [[TMP14]], align 4 -; ORIGINS-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[P]], i32 1, <4 x i1> [[MASK]]) +; ORIGINS-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[P]], <4 x i1> [[MASK]]) ; ORIGINS-NEXT: ret void ; entry: @@ -93,8 +93,8 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP3]], i32 1, <4 x i1> [[MASK:%.*]], <4 x i64> [[TMP0]]) -; CHECK-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[P]], i32 1, <4 x i1> [[MASK]], <4 x double> [[V:%.*]]) +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 [[TMP3]], <4 x i1> [[MASK:%.*]], <4 x i64> [[TMP0]]) +; CHECK-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[P]], <4 x i1> [[MASK]], <4 x double> [[V:%.*]]) ; CHECK-NEXT: store <4 x i64> [[_MSMASKEDLD]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[X]] ; 
@@ -107,7 +107,7 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo ; ADDR-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; ADDR-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 ; ADDR-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; ADDR-NEXT: [[_MSMASKEDLD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP5]], i32 1, <4 x i1> [[MASK:%.*]], <4 x i64> [[TMP2]]) +; ADDR-NEXT: [[_MSMASKEDLD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 [[TMP5]], <4 x i1> [[MASK:%.*]], <4 x i64> [[TMP2]]) ; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 ; ADDR-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[TMP1]] to i4 ; ADDR-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 @@ -117,7 +117,7 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo ; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; ADDR-NEXT: unreachable ; ADDR: 8: -; ADDR-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[P]], i32 1, <4 x i1> [[MASK]], <4 x double> [[V:%.*]]) +; ADDR-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[P]], <4 x i1> [[MASK]], <4 x double> [[V:%.*]]) ; ADDR-NEXT: store <4 x i64> [[_MSMASKEDLD]], ptr @__msan_retval_tls, align 8 ; ADDR-NEXT: ret <4 x double> [[X]] ; @@ -132,7 +132,7 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo ; ORIGINS-NEXT: [[TMP5:%.*]] = add i64 [[TMP3]], 17592186044416 ; ORIGINS-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -4 ; ORIGINS-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; ORIGINS-NEXT: [[_MSMASKEDLD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[TMP4]], i32 1, <4 x i1> [[MASK:%.*]], <4 x i64> [[TMP0]]) +; ORIGINS-NEXT: [[_MSMASKEDLD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 [[TMP4]], <4 x i1> [[MASK:%.*]], <4 x i64> [[TMP0]]) ; ORIGINS-NEXT: [[TMP8:%.*]] = sub <4 x i1> zeroinitializer, [[MASK]] ; ORIGINS-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i64> ; ORIGINS-NEXT: [[TMP10:%.*]] = and <4 x i64> [[TMP0]], [[TMP9]] @@ -140,7 +140,7 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo ; ORIGINS-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0 ; ORIGINS-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4 ; ORIGINS-NEXT: [[TMP13:%.*]] = select i1 [[_MSCMP]], i32 [[TMP1]], i32 [[TMP12]] -; ORIGINS-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[P]], i32 1, <4 x i1> [[MASK]], <4 x double> [[V:%.*]]) +; ORIGINS-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[P]], <4 x i1> [[MASK]], <4 x double> [[V:%.*]]) ; ORIGINS-NEXT: store <4 x i64> [[_MSMASKEDLD]], ptr @__msan_retval_tls, align 8 ; ORIGINS-NEXT: store i32 [[TMP13]], ptr @__msan_retval_origin_tls, align 4 ; ORIGINS-NEXT: ret <4 x double> [[X]] @@ -157,8 +157,8 @@ define void @StoreNoSanitize(ptr %p, <4 x i64> %v, <4 x i1> %mask) { ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080 ; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP2]], i32 1, <4 x i1> [[MASK:%.*]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[P]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP2]], <4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> 
[[V:%.*]], ptr align 1 [[P]], <4 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @StoreNoSanitize( @@ -167,8 +167,8 @@ define void @StoreNoSanitize(ptr %p, <4 x i64> %v, <4 x i1> %mask) { ; ADDR-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[P:%.*]] to i64 ; ADDR-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080 ; ADDR-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr -; ADDR-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP2]], i32 1, <4 x i1> [[MASK:%.*]]) -; ADDR-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[P]], i32 1, <4 x i1> [[MASK]]) +; ADDR-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP2]], <4 x i1> [[MASK:%.*]]) +; ADDR-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[P]], <4 x i1> [[MASK]]) ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @StoreNoSanitize( @@ -180,7 +180,7 @@ define void @StoreNoSanitize(ptr %p, <4 x i64> %v, <4 x i1> %mask) { ; ORIGINS-NEXT: [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416 ; ORIGINS-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4 ; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; ORIGINS-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP2]], i32 1, <4 x i1> [[MASK:%.*]]) +; ORIGINS-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP2]], <4 x i1> [[MASK:%.*]]) ; ORIGINS-NEXT: store i32 0, ptr [[TMP5]], align 4 ; ORIGINS-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 1 ; ORIGINS-NEXT: store i32 0, ptr [[TMP6]], align 4 @@ -196,7 +196,7 @@ define void @StoreNoSanitize(ptr %p, <4 x i64> %v, <4 x i1> %mask) { ; ORIGINS-NEXT: store i32 0, ptr [[TMP11]], align 4 ; ORIGINS-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP5]], i32 7 ; ORIGINS-NEXT: store i32 0, ptr [[TMP12]], align 4 -; ORIGINS-NEXT: tail call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[P]], i32 1, <4 x i1> [[MASK]]) +; ORIGINS-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[P]], <4 x i1> [[MASK]]) ; ORIGINS-NEXT: ret void ; entry: @@ -208,21 +208,21 @@ define <4 x double> @LoadNoSanitize(ptr %p, <4 x double> %v, <4 x i1> %mask) { ; CHECK-LABEL: @LoadNoSanitize( ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[P:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x double> [[V:%.*]]) +; CHECK-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[P:%.*]], <4 x i1> [[MASK:%.*]], <4 x double> [[V:%.*]]) ; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[X]] ; ; ADDR-LABEL: @LoadNoSanitize( ; ADDR-NEXT: entry: ; ADDR-NEXT: call void @llvm.donothing() -; ADDR-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[P:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x double> [[V:%.*]]) +; ADDR-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[P:%.*]], <4 x i1> [[MASK:%.*]], <4 x double> [[V:%.*]]) ; ADDR-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; ADDR-NEXT: ret <4 x double> [[X]] ; ; ORIGINS-LABEL: @LoadNoSanitize( ; ORIGINS-NEXT: entry: ; ORIGINS-NEXT: call void @llvm.donothing() -; ORIGINS-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[P:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x double> [[V:%.*]]) +; ORIGINS-NEXT: [[X:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[P:%.*]], <4 x 
i1> [[MASK:%.*]], <4 x double> [[V:%.*]]) ; ORIGINS-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; ORIGINS-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 ; ORIGINS-NEXT: ret <4 x double> [[X]] @@ -240,8 +240,8 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <16 x ptr> [[PTRS:%.*]] to <16 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i64> [[TMP2]], splat (i64 87960930222080) ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <16 x i64> [[TMP3]] to <16 x ptr> -; CHECK-NEXT: [[_MSMASKEDGATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP4]], i32 4, <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]]) -; CHECK-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[PTRS]], i32 4, <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[_MSMASKEDGATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP4]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]]) +; CHECK-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[PTRS]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) ; CHECK-NEXT: store <16 x i32> [[_MSMASKEDGATHER]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[RET]] ; @@ -254,7 +254,7 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas ; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint <16 x ptr> [[PTRS:%.*]] to <16 x i64> ; ADDR-NEXT: [[TMP5:%.*]] = xor <16 x i64> [[TMP4]], splat (i64 87960930222080) ; ADDR-NEXT: [[TMP6:%.*]] = inttoptr <16 x i64> [[TMP5]] to <16 x ptr> -; ADDR-NEXT: [[_MSMASKEDGATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP6]], i32 4, <16 x i1> [[MASK]], <16 x i32> [[TMP3]]) +; ADDR-NEXT: [[_MSMASKEDGATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP6]], <16 x i1> [[MASK]], <16 x i32> [[TMP3]]) ; ADDR-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[TMP1]] to i16 ; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i16 [[TMP7]], 0 ; ADDR-NEXT: [[TMP8:%.*]] = bitcast <16 x i64> [[_MSMASKEDPTRS]] to i1024 @@ -265,7 +265,7 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas ; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; ADDR-NEXT: unreachable ; ADDR: 10: -; ADDR-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[PTRS]], i32 4, <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) +; ADDR-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[PTRS]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) ; ADDR-NEXT: store <16 x i32> [[_MSMASKEDGATHER]], ptr @__msan_retval_tls, align 8 ; ADDR-NEXT: ret <16 x float> [[RET]] ; @@ -278,8 +278,8 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas ; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr <16 x i64> [[TMP4]] to <16 x ptr> ; ORIGINS-NEXT: [[TMP6:%.*]] = add <16 x i64> [[TMP4]], splat (i64 17592186044416) ; ORIGINS-NEXT: [[TMP7:%.*]] = inttoptr <16 x i64> [[TMP6]] to <16 x ptr> -; ORIGINS-NEXT: [[_MSMASKEDGATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP5]], i32 4, <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]]) -; ORIGINS-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[PTRS]], i32 4, <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) +; ORIGINS-NEXT: [[_MSMASKEDGATHER:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP5]], <16 x i1> [[MASK:%.*]], <16 x i32> [[TMP1]]) +; ORIGINS-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[PTRS]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) ; ORIGINS-NEXT: store <16 x i32> [[_MSMASKEDGATHER]], ptr @__msan_retval_tls, align 8 ; ORIGINS-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 ; ORIGINS-NEXT: ret <16 x float> [[RET]] @@ -291,20 +291,20 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas define <16 x float> @GatherNoSanitize(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %passthru) { ; CHECK-LABEL: @GatherNoSanitize( ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[PTRS:%.*]], i32 4, <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[PTRS:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]]) ; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[RET]] ; ; ADDR-LABEL: @GatherNoSanitize( ; ADDR-NEXT: call void @llvm.donothing() ; ADDR-NEXT: [[_MSMASKEDPTRS:%.*]] = select <16 x i1> [[MASK:%.*]], <16 x i64> zeroinitializer, <16 x i64> zeroinitializer -; ADDR-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[PTRS:%.*]], i32 4, <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) +; ADDR-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[PTRS:%.*]], <16 x i1> [[MASK]], <16 x float> [[PASSTHRU:%.*]]) ; ADDR-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; ADDR-NEXT: ret <16 x float> [[RET]] ; ; ORIGINS-LABEL: @GatherNoSanitize( ; ORIGINS-NEXT: call void @llvm.donothing() -; ORIGINS-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[PTRS:%.*]], i32 4, <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]]) +; ORIGINS-NEXT: [[RET:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[PTRS:%.*]], <16 x i1> [[MASK:%.*]], <16 x float> [[PASSTHRU:%.*]]) ; ORIGINS-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; ORIGINS-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 ; ORIGINS-NEXT: ret <16 x float> [[RET]] @@ -321,8 +321,8 @@ define void @Scatter(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask) sanitize ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[PTRS:%.*]] to <8 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 87960930222080) ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <8 x i64> [[TMP3]] to <8 x ptr> -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP1]], <8 x ptr> [[TMP4]], i32 8, <8 x i1> [[MASK:%.*]]) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> [[PTRS]], i32 8, <8 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP1]], <8 x ptr> align 8 [[TMP4]], <8 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @Scatter( @@ -334,7 +334,7 @@ define void @Scatter(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask) sanitize ; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint <8 x ptr> [[PTRS:%.*]] to <8 x i64> ; ADDR-NEXT: [[TMP5:%.*]] = xor <8 x i64> 
[[TMP4]], splat (i64 87960930222080) ; ADDR-NEXT: [[TMP6:%.*]] = inttoptr <8 x i64> [[TMP5]] to <8 x ptr> -; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP3]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[MASK]]) +; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP3]], <8 x ptr> align 8 [[TMP6]], <8 x i1> [[MASK]]) ; ADDR-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[TMP1]] to i8 ; ADDR-NEXT: [[_MSCMP:%.*]] = icmp ne i8 [[TMP7]], 0 ; ADDR-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSMASKEDPTRS]] to i512 @@ -345,7 +345,7 @@ define void @Scatter(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask) sanitize ; ADDR-NEXT: call void @__msan_warning_noreturn() #[[ATTR7]] ; ADDR-NEXT: unreachable ; ADDR: 10: -; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> [[PTRS]], i32 8, <8 x i1> [[MASK]]) +; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]]) ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @Scatter( @@ -357,8 +357,8 @@ define void @Scatter(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask) sanitize ; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr <8 x i64> [[TMP4]] to <8 x ptr> ; ORIGINS-NEXT: [[TMP6:%.*]] = add <8 x i64> [[TMP4]], splat (i64 17592186044416) ; ORIGINS-NEXT: [[TMP7:%.*]] = inttoptr <8 x i64> [[TMP6]] to <8 x ptr> -; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP1]], <8 x ptr> [[TMP5]], i32 8, <8 x i1> [[MASK:%.*]]) -; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> [[PTRS]], i32 8, <8 x i1> [[MASK]]) +; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP1]], <8 x ptr> align 8 [[TMP5]], <8 x i1> [[MASK:%.*]]) +; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]]) ; ORIGINS-NEXT: ret void ; call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %value, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) @@ -371,8 +371,8 @@ define void @ScatterNoSanitize(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[PTRS:%.*]] to <8 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 87960930222080) ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <8 x i64> [[TMP2]] to <8 x ptr> -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> zeroinitializer, <8 x ptr> [[TMP3]], i32 8, <8 x i1> [[MASK:%.*]]) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> [[PTRS]], i32 8, <8 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> zeroinitializer, <8 x ptr> align 8 [[TMP3]], <8 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; ; ADDR-LABEL: @ScatterNoSanitize( @@ -381,8 +381,8 @@ define void @ScatterNoSanitize(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask ; ADDR-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[PTRS:%.*]] to <8 x i64> ; ADDR-NEXT: [[TMP2:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 87960930222080) ; ADDR-NEXT: [[TMP3:%.*]] = inttoptr <8 x i64> [[TMP2]] to <8 x ptr> -; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> zeroinitializer, <8 x ptr> [[TMP3]], i32 8, <8 x i1> [[MASK]]) -; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> [[PTRS]], i32 8, <8 x i1> [[MASK]]) +; ADDR-NEXT: call void 
@llvm.masked.scatter.v8i32.v8p0(<8 x i32> zeroinitializer, <8 x ptr> align 8 [[TMP3]], <8 x i1> [[MASK]]) +; ADDR-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]]) ; ADDR-NEXT: ret void ; ; ORIGINS-LABEL: @ScatterNoSanitize( @@ -392,8 +392,8 @@ define void @ScatterNoSanitize(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask ; ORIGINS-NEXT: [[TMP3:%.*]] = inttoptr <8 x i64> [[TMP2]] to <8 x ptr> ; ORIGINS-NEXT: [[TMP4:%.*]] = add <8 x i64> [[TMP2]], splat (i64 17592186044416) ; ORIGINS-NEXT: [[TMP5:%.*]] = inttoptr <8 x i64> [[TMP4]] to <8 x ptr> -; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> zeroinitializer, <8 x ptr> [[TMP3]], i32 8, <8 x i1> [[MASK:%.*]]) -; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> [[PTRS]], i32 8, <8 x i1> [[MASK]]) +; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> zeroinitializer, <8 x ptr> align 8 [[TMP3]], <8 x i1> [[MASK:%.*]]) +; ORIGINS-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VALUE:%.*]], <8 x ptr> align 8 [[PTRS]], <8 x i1> [[MASK]]) ; ORIGINS-NEXT: ret void ; call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %value, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td index 0c23e3b..7aaf3a0 100644 --- a/llvm/test/TableGen/RuntimeLibcallEmitter.td +++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td @@ -73,6 +73,8 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi // All entries should be emitted in Libcall enum. // CHECK: #ifdef GET_RUNTIME_LIBCALL_ENUM +// CHECK-NEXT: #undef GET_RUNTIME_LIBCALL_ENUM +// CHECK-EMPTY: // CHECK-NEXT: namespace llvm { // CHECK-NEXT: namespace RTLIB { // CHECK-NEXT: enum Libcall : unsigned short { @@ -101,9 +103,12 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi // CHECK-NEXT: constexpr size_t NumLibcallImpls = 9; // CHECK-NEXT: } // End namespace RTLIB // CHECK-NEXT: } // End namespace llvm -// CHECK-NEXT: #endif +// CHECK-EMPTY: +// CHECK-NEXT: #endif // GET_RUNTIME_LIBCALL_ENUM // CHECK: #ifdef GET_INIT_RUNTIME_LIBCALL_NAMES +// CHECK-NEXT: #undef GET_INIT_RUNTIME_LIBCALL_NAMES +// CHECK-EMPTY: // CHECK-EMPTY: // CHECK-NEXT: #ifdef __GNUC__ // CHECK-NEXT: #pragma GCC diagnostic push @@ -163,13 +168,18 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi // CHECK-NEXT: }; // CHECK: #ifdef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY +// CHECK-NEXT: #undef GET_LOOKUP_LIBCALL_IMPL_NAME_BODY +// CHECK-EMPTY: // CHECK-NEXT: size_t Size = Name.size(); // CHECK-NEXT: if (Size == 0 || Size > 9) // CHECK-NEXT: return enum_seq(RTLIB::Unsupported, RTLIB::Unsupported); // CHECK-NEXT: return lookupLibcallImplNameImpl(Name); -// CHECK-NEXT: #endif +// CHECK-EMPTY: +// CHECK-NEXT: #endif // GET_LOOKUP_LIBCALL_IMPL_NAME_BODY // CHECK: #ifdef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME +// CHECK-NEXT: #undef DEFINE_GET_LOOKUP_LIBCALL_IMPL_NAME +// CHECK-EMPTY: // CHECK-NEXT: static inline uint64_t hash(StringRef Str) { // CHECK-NEXT: return static_cast<uint32_t>(xxh3_64bits(Str)); // CHECK-NEXT: } diff --git a/llvm/test/Transforms/Attributor/readattrs.ll b/llvm/test/Transforms/Attributor/readattrs.ll index 30cb085..1993472 100644 --- a/llvm/test/Transforms/Attributor/readattrs.ll +++ b/llvm/test/Transforms/Attributor/readattrs.ll @@ -137,13 +137,13 @@ define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) 
{ ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@test9 ; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] { -; TUNIT-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 noundef 4, <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>) #[[ATTR16:[0-9]+]] +; TUNIT-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> align 4 [[PTRS]], <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>) #[[ATTR16:[0-9]+]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; CGSCC-LABEL: define {{[^@]+}}@test9 ; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] { -; CGSCC-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 noundef 4, <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>) #[[ATTR18:[0-9]+]] +; CGSCC-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> align 4 [[PTRS]], <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>) #[[ATTR18:[0-9]+]] ; CGSCC-NEXT: ret void ; call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>%val, <4 x ptr> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>) @@ -154,14 +154,14 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x define <4 x i32> @test10(<4 x ptr> %ptrs) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; TUNIT-LABEL: define {{[^@]+}}@test10 -; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR7:[0-9]+]] { -; TUNIT-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 noundef 4, <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR17:[0-9]+]] +; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR5:[0-9]+]] { +; TUNIT-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR17:[0-9]+]] ; TUNIT-NEXT: ret <4 x i32> [[RES]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; CGSCC-LABEL: define {{[^@]+}}@test10 -; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR8:[0-9]+]] { -; CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 noundef 4, <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR19:[0-9]+]] +; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR6:[0-9]+]] { +; CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> noundef <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR19:[0-9]+]] ; CGSCC-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>, <4 x i32>undef) @@ -172,14 +172,14 @@ declare <4 x i32> @test11_1(<4 x ptr>) argmemonly nounwind readonly define <4 x i32> @test11_2(<4 x ptr> %ptrs) { ; TUNIT: Function Attrs: nosync nounwind memory(argmem: read) ; TUNIT-LABEL: define {{[^@]+}}@test11_2 -; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { -; TUNIT-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) #[[ATTR15:[0-9]+]] +; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR7:[0-9]+]] { +; TUNIT-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) #[[ATTR13:[0-9]+]] ; 
TUNIT-NEXT: ret <4 x i32> [[RES]] ; ; CGSCC: Function Attrs: nosync nounwind memory(argmem: read) ; CGSCC-LABEL: define {{[^@]+}}@test11_2 -; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR10:[0-9]+]] { -; CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) #[[ATTR16:[0-9]+]] +; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR8:[0-9]+]] { +; CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) #[[ATTR14:[0-9]+]] ; CGSCC-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @test11_1(<4 x ptr> %ptrs) @@ -191,13 +191,13 @@ declare <4 x i32> @test12_1(<4 x ptr>) argmemonly nounwind define <4 x i32> @test12_2(<4 x ptr> %ptrs) { ; TUNIT: Function Attrs: nounwind memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@test12_2 -; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR10:[0-9]+]] { +; TUNIT-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR8:[0-9]+]] { ; TUNIT-NEXT: [[RES:%.*]] = call <4 x i32> @test12_1(<4 x ptr> [[PTRS]]) #[[ATTR18:[0-9]+]] ; TUNIT-NEXT: ret <4 x i32> [[RES]] ; ; CGSCC: Function Attrs: nounwind memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@test12_2 -; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR11:[0-9]+]] { +; CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { ; CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @test12_1(<4 x ptr> [[PTRS]]) #[[ATTR20:[0-9]+]] ; CGSCC-NEXT: ret <4 x i32> [[RES]] ; @@ -208,13 +208,13 @@ define <4 x i32> @test12_2(<4 x ptr> %ptrs) { define i32 @volatile_load(ptr %p) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) ; TUNIT-LABEL: define {{[^@]+}}@volatile_load -; TUNIT-SAME: (ptr nofree noundef align 4 [[P:%.*]]) #[[ATTR11:[0-9]+]] { +; TUNIT-SAME: (ptr nofree noundef align 4 [[P:%.*]]) #[[ATTR9:[0-9]+]] { ; TUNIT-NEXT: [[LOAD:%.*]] = load volatile i32, ptr [[P]], align 4 ; TUNIT-NEXT: ret i32 [[LOAD]] ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) ; CGSCC-LABEL: define {{[^@]+}}@volatile_load -; CGSCC-SAME: (ptr nofree noundef align 4 [[P:%.*]]) #[[ATTR12:[0-9]+]] { +; CGSCC-SAME: (ptr nofree noundef align 4 [[P:%.*]]) #[[ATTR10:[0-9]+]] { ; CGSCC-NEXT: [[LOAD:%.*]] = load volatile i32, ptr [[P]], align 4 ; CGSCC-NEXT: ret i32 [[LOAD]] ; @@ -292,13 +292,13 @@ define void @byval_not_readonly_2(ptr byval(i8) %written) readonly { define void @byval_not_readnone_1(ptr byval(i8) %written) readnone { ; TUNIT: Function Attrs: nosync memory(none) ; TUNIT-LABEL: define {{[^@]+}}@byval_not_readnone_1 -; TUNIT-SAME: (ptr noalias nonnull byval(i8) dereferenceable(1) [[WRITTEN:%.*]]) #[[ATTR12:[0-9]+]] { +; TUNIT-SAME: (ptr noalias nonnull byval(i8) dereferenceable(1) [[WRITTEN:%.*]]) #[[ATTR10:[0-9]+]] { ; TUNIT-NEXT: call void @escape_i8(ptr nonnull dereferenceable(1) [[WRITTEN]]) ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nosync memory(none) ; CGSCC-LABEL: define {{[^@]+}}@byval_not_readnone_1 -; CGSCC-SAME: (ptr noalias nonnull byval(i8) dereferenceable(1) [[WRITTEN:%.*]]) #[[ATTR13:[0-9]+]] { +; CGSCC-SAME: (ptr noalias nonnull byval(i8) dereferenceable(1) [[WRITTEN:%.*]]) #[[ATTR11:[0-9]+]] { ; CGSCC-NEXT: call void @escape_i8(ptr nonnull dereferenceable(1) [[WRITTEN]]) ; CGSCC-NEXT: ret void ; @@ -331,17 +331,17 @@ define void @byval_no_fnarg(ptr byval(i8) %written) { define void @testbyval(ptr %read_only) { ; TUNIT: Function Attrs: nosync ; TUNIT-LABEL: define {{[^@]+}}@testbyval -; TUNIT-SAME: (ptr nonnull readonly captures(none) [[READ_ONLY:%.*]]) #[[ATTR13:[0-9]+]] { +; TUNIT-SAME: (ptr nonnull 
readonly captures(none) [[READ_ONLY:%.*]]) #[[ATTR11:[0-9]+]] { ; TUNIT-NEXT: call void @byval_not_readonly_1(ptr noalias nonnull readonly byval(i8) captures(none) [[READ_ONLY]]) #[[ATTR3]] -; TUNIT-NEXT: call void @byval_not_readnone_1(ptr noalias nonnull readnone byval(i8) captures(none) [[READ_ONLY]]) #[[ATTR13]] +; TUNIT-NEXT: call void @byval_not_readnone_1(ptr noalias nonnull readnone byval(i8) captures(none) [[READ_ONLY]]) #[[ATTR11]] ; TUNIT-NEXT: call void @byval_no_fnarg(ptr noalias nofree noundef nonnull readonly byval(i8) captures(none) [[READ_ONLY]]) #[[ATTR19:[0-9]+]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nosync ; CGSCC-LABEL: define {{[^@]+}}@testbyval -; CGSCC-SAME: (ptr noundef nonnull readonly captures(none) dereferenceable(1) [[READ_ONLY:%.*]]) #[[ATTR14:[0-9]+]] { +; CGSCC-SAME: (ptr noundef nonnull readonly captures(none) dereferenceable(1) [[READ_ONLY:%.*]]) #[[ATTR12:[0-9]+]] { ; CGSCC-NEXT: call void @byval_not_readonly_1(ptr noalias noundef nonnull readonly byval(i8) captures(none) dereferenceable(1) [[READ_ONLY]]) #[[ATTR2:[0-9]+]] -; CGSCC-NEXT: call void @byval_not_readnone_1(ptr noalias noundef nonnull readnone byval(i8) captures(none) dereferenceable(1) [[READ_ONLY]]) #[[ATTR14]] +; CGSCC-NEXT: call void @byval_not_readnone_1(ptr noalias noundef nonnull readnone byval(i8) captures(none) dereferenceable(1) [[READ_ONLY]]) #[[ATTR12]] ; CGSCC-NEXT: call void @byval_no_fnarg(ptr noalias nofree noundef nonnull readnone byval(i8) captures(none) dereferenceable(1) [[READ_ONLY]]) #[[ATTR21:[0-9]+]] ; CGSCC-NEXT: ret void ; @@ -361,16 +361,16 @@ declare void @val_use(i8 %ptr) readonly nounwind define void @ptr_uses(ptr %ptr) { ; TUNIT: Function Attrs: nosync nounwind memory(read) ; TUNIT-LABEL: define {{[^@]+}}@ptr_uses -; TUNIT-SAME: (ptr nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR15]] { -; TUNIT-NEXT: [[CALL_PTR:%.*]] = call ptr @maybe_returned_ptr(ptr nofree readonly [[PTR]]) #[[ATTR15]] -; TUNIT-NEXT: [[CALL_VAL:%.*]] = call i8 @maybe_returned_val(ptr readonly [[CALL_PTR]]) #[[ATTR15]] +; TUNIT-SAME: (ptr nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR13]] { +; TUNIT-NEXT: [[CALL_PTR:%.*]] = call ptr @maybe_returned_ptr(ptr nofree readonly [[PTR]]) #[[ATTR13]] +; TUNIT-NEXT: [[CALL_VAL:%.*]] = call i8 @maybe_returned_val(ptr readonly [[CALL_PTR]]) #[[ATTR13]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: nosync nounwind memory(read) ; CGSCC-LABEL: define {{[^@]+}}@ptr_uses -; CGSCC-SAME: (ptr nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR16]] { -; CGSCC-NEXT: [[CALL_PTR:%.*]] = call ptr @maybe_returned_ptr(ptr nofree readonly [[PTR]]) #[[ATTR16]] -; CGSCC-NEXT: [[CALL_VAL:%.*]] = call i8 @maybe_returned_val(ptr readonly [[CALL_PTR]]) #[[ATTR16]] +; CGSCC-SAME: (ptr nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR14]] { +; CGSCC-NEXT: [[CALL_PTR:%.*]] = call ptr @maybe_returned_ptr(ptr nofree readonly [[PTR]]) #[[ATTR14]] +; CGSCC-NEXT: [[CALL_VAL:%.*]] = call i8 @maybe_returned_val(ptr readonly [[CALL_PTR]]) #[[ATTR14]] ; CGSCC-NEXT: ret void ; %call_ptr = call ptr @maybe_returned_ptr(ptr %ptr) @@ -406,17 +406,17 @@ define i32 @read_only_constant_mem() { ; TUNIT: attributes #[[ATTR2:[0-9]+]] = { memory(read) } ; TUNIT: attributes #[[ATTR3]] = { nosync memory(read) } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } -; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { 
nocallback nofree nosync nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR7]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nounwind memory(argmem: read) } -; TUNIT: attributes #[[ATTR9]] = { nosync nounwind memory(argmem: read) } -; TUNIT: attributes #[[ATTR10]] = { nounwind memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR11]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } -; TUNIT: attributes #[[ATTR12]] = { nosync memory(none) } -; TUNIT: attributes #[[ATTR13]] = { nosync } -; TUNIT: attributes #[[ATTR14:[0-9]+]] = { nounwind memory(read) } -; TUNIT: attributes #[[ATTR15]] = { nosync nounwind memory(read) } +; TUNIT: attributes #[[ATTR5]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nounwind memory(argmem: read) } +; TUNIT: attributes #[[ATTR7]] = { nosync nounwind memory(argmem: read) } +; TUNIT: attributes #[[ATTR8]] = { nounwind memory(argmem: readwrite) } +; TUNIT: attributes #[[ATTR9]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } +; TUNIT: attributes #[[ATTR10]] = { nosync memory(none) } +; TUNIT: attributes #[[ATTR11]] = { nosync } +; TUNIT: attributes #[[ATTR12:[0-9]+]] = { nounwind memory(read) } +; TUNIT: attributes #[[ATTR13]] = { nosync nounwind memory(read) } +; TUNIT: attributes #[[ATTR14:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) } +; TUNIT: attributes #[[ATTR15:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) } ; TUNIT: attributes #[[ATTR16]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR17]] = { nofree willreturn memory(read) } ; TUNIT: attributes #[[ATTR18]] = { nounwind } @@ -428,17 +428,17 @@ define i32 @read_only_constant_mem() { ; CGSCC: attributes #[[ATTR3]] = { nosync memory(read) } ; CGSCC: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) } -; CGSCC: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) } -; CGSCC: attributes #[[ATTR8]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) } -; CGSCC: attributes #[[ATTR9:[0-9]+]] = { nounwind memory(argmem: read) } -; CGSCC: attributes #[[ATTR10]] = { nosync nounwind memory(argmem: read) } -; CGSCC: attributes #[[ATTR11]] = { nounwind memory(argmem: readwrite) } -; CGSCC: attributes #[[ATTR12]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } -; CGSCC: attributes #[[ATTR13]] = { nosync memory(none) } -; CGSCC: attributes #[[ATTR14]] = { nosync } -; CGSCC: attributes #[[ATTR15:[0-9]+]] = { nounwind memory(read) } -; CGSCC: attributes #[[ATTR16]] = { nosync nounwind memory(read) } +; CGSCC: attributes #[[ATTR6]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) } +; CGSCC: attributes #[[ATTR7:[0-9]+]] = { nounwind memory(argmem: read) } +; CGSCC: attributes #[[ATTR8]] = { nosync nounwind memory(argmem: read) } +; CGSCC: attributes #[[ATTR9]] = { nounwind memory(argmem: readwrite) } +; CGSCC: attributes #[[ATTR10]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) } +; CGSCC: attributes #[[ATTR11]] = { nosync 
memory(none) } +; CGSCC: attributes #[[ATTR12]] = { nosync } +; CGSCC: attributes #[[ATTR13:[0-9]+]] = { nounwind memory(read) } +; CGSCC: attributes #[[ATTR14]] = { nosync nounwind memory(read) } +; CGSCC: attributes #[[ATTR15:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(write) } +; CGSCC: attributes #[[ATTR16:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) } ; CGSCC: attributes #[[ATTR17]] = { nofree nosync willreturn } ; CGSCC: attributes #[[ATTR18]] = { nofree willreturn memory(write) } ; CGSCC: attributes #[[ATTR19]] = { nofree willreturn memory(read) } diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/dont-sink-scalable-vector-compare.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/dont-sink-scalable-vector-compare.ll index 26de31d6..b0cd589 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/dont-sink-scalable-vector-compare.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/dont-sink-scalable-vector-compare.ll @@ -13,9 +13,9 @@ define void @do_not_sink_scalable_vector_compare(ptr %a, ptr %b) #0 { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[SRC]], i32 4, <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[SRC]], <vscale x 4 x i1> [[TMP0]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[DST:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_LOAD]], ptr [[DST]], i32 4, <vscale x 4 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_LOAD]], ptr align 4 [[DST]], <vscale x 4 x i1> [[TMP0]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[VECTOR_END:.*]], label %[[VECTOR_BODY]] diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll index 3c5c07f..0c48d0c 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt-inseltpoison.ll @@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu" define <vscale x 4 x i32> @splat_base(ptr %base, <vscale x 4 x i64> %index, <vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @splat_base( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], <vscale x 4 x i64> [[INDEX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <vscale x 4 x ptr> poison, ptr %base, i32 0 @@ -24,7 +24,7 @@ define <vscale x 4 x i32> @splat_struct(ptr %base, <vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @splat_struct( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[BASE:%.*]], i64 0, 
i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP2]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %gep = getelementptr %struct.a, ptr %base, <vscale x 4 x i64> zeroinitializer, i32 1 @@ -36,7 +36,7 @@ define <vscale x 4 x i32> @scalar_index(ptr %base, i64 %index, <vscale x 4 x i1> ; CHECK-LABEL: @scalar_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP2]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <vscale x 4 x ptr> poison, ptr %base, i32 0 @@ -50,7 +50,7 @@ define <vscale x 4 x i32> @splat_index(ptr %base, i64 %index, <vscale x 4 x i1> ; CHECK-LABEL: @splat_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP2]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <vscale x 4 x i64> poison, i64 %index, i32 0 @@ -63,7 +63,7 @@ define <vscale x 4 x i32> @splat_index(ptr %base, i64 %index, <vscale x 4 x i1> define <vscale x 4 x i32> @test_global_array(<vscale x 4 x i64> %indxs, <vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @test_global_array( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr @glob_array, <vscale x 4 x i64> [[INDXS:%.*]] -; CHECK-NEXT: [[G:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[G:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[G]] ; %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <vscale x 4 x i64> %indxs @@ -73,7 +73,7 @@ define <vscale x 4 x i32> @test_global_array(<vscale x 4 x i64> %indxs, <vscale define <vscale x 4 x i32> @global_struct_splat(<vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @global_struct_splat( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> 
zeroinitializer), i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]] ; %1 = insertelement <vscale x 4 x ptr> poison, ptr @c, i32 0 @@ -86,7 +86,7 @@ define <vscale x 4 x i32> @global_struct_splat(<vscale x 4 x i1> %mask) #0 { define <vscale x 4 x i32> @splat_ptr_gather(ptr %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) #0 { ; CHECK-LABEL: @splat_ptr_gather( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]] ; %1 = insertelement <vscale x 4 x ptr> poison, ptr %ptr, i32 0 @@ -98,7 +98,7 @@ define <vscale x 4 x i32> @splat_ptr_gather(ptr %ptr, <vscale x 4 x i1> %mask, < define void @splat_ptr_scatter(ptr %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %val) #0 { ; CHECK-LABEL: @splat_ptr_scatter( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VAL:%.*]], <vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VAL:%.*]], <vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]]) ; CHECK-NEXT: ret void ; %1 = insertelement <vscale x 4 x ptr> poison, ptr %ptr, i32 0 diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll index 36cd69e..c82a541 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/gather-scatter-opt.ll @@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu" define <vscale x 4 x i32> @splat_base(ptr %base, <vscale x 4 x i64> %index, <vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @splat_base( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], <vscale x 4 x i64> [[INDEX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <vscale x 4 x ptr> undef, ptr %base, i32 0 @@ -24,7 +24,7 @@ define <vscale x 4 x i32> @splat_struct(ptr %base, <vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @splat_struct( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[BASE:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], 
<vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP2]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %gep = getelementptr %struct.a, ptr %base, <vscale x 4 x i64> zeroinitializer, i32 1 @@ -36,7 +36,7 @@ define <vscale x 4 x i32> @scalar_index(ptr %base, i64 %index, <vscale x 4 x i1> ; CHECK-LABEL: @scalar_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP2]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <vscale x 4 x ptr> undef, ptr %base, i32 0 @@ -50,7 +50,7 @@ define <vscale x 4 x i32> @splat_index(ptr %base, i64 %index, <vscale x 4 x i1> ; CHECK-LABEL: @splat_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP2]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP2]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <vscale x 4 x i64> undef, i64 %index, i32 0 @@ -63,7 +63,7 @@ define <vscale x 4 x i32> @splat_index(ptr %base, i64 %index, <vscale x 4 x i1> define <vscale x 4 x i32> @test_global_array(<vscale x 4 x i64> %indxs, <vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @test_global_array( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr @glob_array, <vscale x 4 x i64> [[INDXS:%.*]] -; CHECK-NEXT: [[G:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) +; CHECK-NEXT: [[G:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[G]] ; %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <vscale x 4 x i64> %indxs @@ -73,7 +73,7 @@ define <vscale x 4 x i32> @test_global_array(<vscale x 4 x i64> %indxs, <vscale define <vscale x 4 x i32> @global_struct_splat(<vscale x 4 x i1> %mask) #0 { ; CHECK-LABEL: @global_struct_splat( -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> 
undef) +; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> undef) ; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP1]] ; %1 = insertelement <vscale x 4 x ptr> undef, ptr @c, i32 0 @@ -86,7 +86,7 @@ define <vscale x 4 x i32> @global_struct_splat(<vscale x 4 x i1> %mask) #0 { define <vscale x 4 x i32> @splat_ptr_gather(ptr %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) #0 { ; CHECK-LABEL: @splat_ptr_gather( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x i32> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP2]] ; %1 = insertelement <vscale x 4 x ptr> undef, ptr %ptr, i32 0 @@ -98,7 +98,7 @@ define <vscale x 4 x i32> @splat_ptr_gather(ptr %ptr, <vscale x 4 x i1> %mask, < define void @splat_ptr_scatter(ptr %ptr, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %val) #0 { ; CHECK-LABEL: @splat_ptr_scatter( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <vscale x 4 x i64> zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VAL:%.*]], <vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VAL:%.*]], <vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK:%.*]]) ; CHECK-NEXT: ret void ; %1 = insertelement <vscale x 4 x ptr> undef, ptr %ptr, i32 0 diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll index f170c8f..9c65653 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-gather-scatter-addressing.ll @@ -11,8 +11,8 @@ define <vscale x 4 x float> @gather_offsets_sink_gep(ptr %base, <vscale x 4 x i3 ; CHECK-NEXT: br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]] ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i32> [[INDICES]] -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP0]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -38,8 +38,8 @@ define <vscale x 4 x float> @gather_offsets_sink_sext(ptr %base, <vscale x 4 x i ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = sext <vscale x 4 x i32> [[INDICES]] to <vscale x 4 x i64> ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr float, ptr [[BASE]], 
<vscale x 4 x i64> [[TMP0]] -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[PTRS]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[PTRS]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -66,8 +66,8 @@ define <vscale x 4 x float> @gather_offsets_sink_sext_get(ptr %base, <vscale x 4 ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = sext <vscale x 4 x i32> [[INDICES]] to <vscale x 4 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]] -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -93,8 +93,8 @@ define <vscale x 4 x float> @gather_no_scalar_base(<vscale x 4 x ptr> %bases, <v ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr float, <vscale x 4 x ptr> [[BASES]], <vscale x 4 x i32> [[INDICES]] ; CHECK-NEXT: br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]] ; CHECK: cond.block: -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[PTRS]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[PTRS]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -121,8 +121,8 @@ define <vscale x 4 x float> @gather_offset_type_too_small(ptr %base, <vscale x 4 ; CHECK-NEXT: br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]] ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i32> [[INDICES_SEXT]] -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP0]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -150,8 +150,8 @@ define <vscale x 4 x float> @gather_offset_type_too_big(ptr %base, <vscale x 4 x ; CHECK-NEXT: br i1 [[COND]], label [[COND_BLOCK:%.*]], label [[EXIT:%.*]] ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[INDICES_SEXT]] -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP0]], i32 4, <vscale 
x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP0]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -181,8 +181,8 @@ define <vscale x 4 x float> @gather_offset_sink_zext(ptr %base, <vscale x 4 x i8 ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = zext <vscale x 4 x i8> [[INDICES]] to <vscale x 4 x i64> ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]] -; CHECK-NEXT: [[LOAD:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[PTRS]], i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD]] +; CHECK-NEXT: [[RET:%.*]] = tail call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[PTRS]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: ret <vscale x 4 x float> [[RET]] ; CHECK: exit: ; CHECK-NEXT: ret <vscale x 4 x float> zeroinitializer ; @@ -209,7 +209,7 @@ define void @scatter_offsets_sink_sext_get(<vscale x 4 x float> %data, ptr %base ; CHECK: cond.block: ; CHECK-NEXT: [[TMP0:%.*]] = sext <vscale x 4 x i32> [[INDICES]] to <vscale x 4 x i64> ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[BASE]], <vscale x 4 x i64> [[TMP0]] -; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[DATA]], <vscale x 4 x ptr> [[TMP1]], i32 4, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[DATA]], <vscale x 4 x ptr> align 4 [[TMP1]], <vscale x 4 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; CHECK: exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll index e27d5d7..136f6aa 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt-inseltpoison.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @splat_base(ptr %base, <4 x i64> %index) { ; CHECK-LABEL: @splat_base( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], <4 x i64> [[INDEX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x ptr> poison, ptr %base, i32 0 @@ -27,7 +27,7 @@ define <4 x i32> @splat_struct(ptr %base) { ; CHECK-LABEL: @splat_struct( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[BASE:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %gep = 
getelementptr %struct.a, ptr %base, <4 x i64> zeroinitializer, i32 1 @@ -39,7 +39,7 @@ define <4 x i32> @scalar_index(ptr %base, i64 %index) { ; CHECK-LABEL: @scalar_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x ptr> poison, ptr %base, i32 0 @@ -53,7 +53,7 @@ define <4 x i32> @splat_index(ptr %base, i64 %index) { ; CHECK-LABEL: @splat_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %index, i32 0 @@ -66,7 +66,7 @@ define <4 x i32> @splat_index(ptr %base, i64 %index) { define <4 x i32> @test_global_array(<4 x i64> %indxs) { ; CHECK-LABEL: @test_global_array( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr @glob_array, <4 x i64> [[INDXS:%.*]] -; CHECK-NEXT: [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[G]] ; %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <4 x i64> %indxs @@ -76,7 +76,7 @@ define <4 x i32> @test_global_array(<4 x i64> %indxs) { define <4 x i32> @global_struct_splat() { ; CHECK-LABEL: @global_struct_splat( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> <ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1)>, i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 <ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1)>, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %1 = insertelement <4 x ptr> poison, ptr @c, i32 0 @@ -89,7 +89,7 @@ define <4 x i32> @global_struct_splat() { define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru) { ; CHECK-LABEL: @splat_ptr_gather( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %1 = insertelement <4 x ptr> poison, ptr %ptr, i32 0 @@ -101,7 +101,7 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) { ; CHECK-LABEL: @splat_ptr_scatter( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <4 x i64> zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> [[TMP1]], i32 4, <4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> align 4 [[TMP1]], <4 x i1> [[MASK:%.*]]) ; CHECK-NEXT: ret void ; %1 = insertelement <4 x ptr> poison, ptr %ptr, i32 0 @@ -113,7 +113,7 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) { define <4 x i32> @scalar_prefix(ptr %base, i64 %index, <4 x i64> %vecidx) { ; CHECK-LABEL: @scalar_prefix( ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], i64 [[INDEX:%.*]], <4 x i64> [[VECIDX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %gep = getelementptr [256 x i32], ptr %base, i64 %index, <4 x i64> %vecidx @@ -126,7 +126,7 @@ define <4 x i32> @scalar_prefix_with_splat(ptr %base, i64 %index, <4 x i64> %vec ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> [[VECIDX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %index, i32 0 @@ -140,7 +140,7 @@ define <4 x i32> @scalar_prefix_with_splat(ptr %base, i64 %index, <4 x i64> %vec define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i64> %vecidx) { ; CHECK-LABEL: @scalar_prefix_with_constant_splat( ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> splat (i64 20), <4 x i64> [[VECIDX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %gep = getelementptr [256 x i32], ptr %base, <4 x i64> splat (i64 20), <4 x i64> %vecidx @@ -151,7 +151,7 @@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i64> %vecidx define <4 x i32> @reassociate(ptr %base, i64 %index, <4 x i64> %vecidx) { ; CHECK-LABEL: @reassociate( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[VECIDX:%.*]], i64 [[INDEX:%.*]] -; CHECK-NEXT: 
[[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[GEP]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[GEP]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %gep = getelementptr [256 x i32], ptr %base, <4 x i64> %vecidx, i64 %index @@ -164,7 +164,7 @@ define <4 x i32> @reassociate_with_splat(ptr %base, i64 %index, <4 x i64> %vecid ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[GEP:%.*]] = getelementptr [256 x i32], ptr [[BASE:%.*]], <4 x i64> [[VECIDX:%.*]], <4 x i64> [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[GEP]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[GEP]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x i64> poison, i64 %index, i32 0 diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll index 8328708..22a007d 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll @@ -12,7 +12,7 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @splat_base(ptr %base, <4 x i64> %index) { ; CHECK-LABEL: @splat_base( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], <4 x i64> [[INDEX:%.*]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x ptr> undef, ptr %base, i32 0 @@ -26,7 +26,7 @@ define <4 x i32> @splat_struct(ptr %base) { ; CHECK-LABEL: @splat_struct( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[BASE:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %gep = getelementptr %struct.a, ptr %base, <4 x i64> zeroinitializer, i32 1 @@ -38,7 +38,7 @@ define <4 x i32> @scalar_index(ptr %base, i64 %index) { ; CHECK-LABEL: @scalar_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x ptr> undef, ptr %base, 
i32 0 @@ -52,7 +52,7 @@ define <4 x i32> @splat_index(ptr %base, i64 %index) { ; CHECK-LABEL: @splat_index( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 [[INDEX:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0 @@ -65,7 +65,7 @@ define <4 x i32> @splat_index(ptr %base, i64 %index) { define <4 x i32> @test_global_array(<4 x i64> %indxs) { ; CHECK-LABEL: @test_global_array( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr @glob_array, <4 x i64> [[INDXS:%.*]] -; CHECK-NEXT: [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[G]] ; %p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <4 x i64> %indxs @@ -75,7 +75,7 @@ define <4 x i32> @test_global_array(<4 x i64> %indxs) { define <4 x i32> @global_struct_splat() { ; CHECK-LABEL: @global_struct_splat( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> <ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1)>, i32 4, <4 x i1> splat (i1 true), <4 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 <ptr getelementptr ([[STRUCT_A:%.*]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1), ptr getelementptr ([[STRUCT_A]], ptr @c, i64 0, i32 1)>, <4 x i1> splat (i1 true), <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; %1 = insertelement <4 x ptr> undef, ptr @c, i32 0 @@ -88,7 +88,7 @@ define <4 x i32> @global_struct_splat() { define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru) { ; CHECK-LABEL: @splat_ptr_gather( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <4 x i64> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0 @@ -100,7 +100,7 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) { ; CHECK-LABEL: @splat_ptr_scatter( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], <4 x i64> zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> [[TMP1]], i32 4, <4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> align 4 [[TMP1]], <4 x i1> [[MASK:%.*]]) ; 
CHECK-NEXT: ret void ; %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0 diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll b/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll index dbd5e87..220d9f8 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/masked-gather-struct-gep.ll @@ -12,7 +12,7 @@ define <4 x float> @foo(ptr %p) { ; CHECK-SAME: (ptr [[P:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = bitcast ptr [[P]] to ptr ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP2]], i32 0, <4 x i1> zeroinitializer, <4 x float> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> zeroinitializer, <4 x float> zeroinitializer) ; CHECK-NEXT: ret <4 x float> [[GATHER]] ; %base.splatinsert = insertelement <4 x ptr> poison, ptr %p, i32 0 diff --git a/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll index 0b4d657..1ff15d0 100644 --- a/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll +++ b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store-inseltpoison.ll @@ -6,24 +6,24 @@ define dllexport i32 @f0(ptr %a0, ptr %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i ; CHECK-LABEL: @f0( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[V0:%.*]] = getelementptr inbounds ptr, ptr [[A0:%.*]], i32 [[A2:%.*]] -; CHECK-NEXT: [[V1:%.*]] = load ptr, ptr [[V0]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT: [[V1:%.*]] = load ptr, ptr [[V0]], align 4, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[V2:%.*]] = getelementptr i8, ptr [[V1]], i32 [[A3:%.*]] ; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds ptr, ptr [[A1:%.*]], i32 [[A4:%.*]] -; CHECK-NEXT: [[V7:%.*]] = load ptr, ptr [[V6]], align 4, [[TBAA3:!tbaa !.*]] +; CHECK-NEXT: [[V7:%.*]] = load ptr, ptr [[V6]], align 4, !tbaa [[TBAA3:![0-9]+]] ; CHECK-NEXT: [[V8:%.*]] = getelementptr i8, ptr [[V7]], i32 [[A5:%.*]] -; CHECK-NEXT: [[V10:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[V8]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false>, <128 x i8> undef), [[TBAA5:!tbaa !.*]] +; CHECK-NEXT: [[V10:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 32 [[V8]], <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa [[TBAA5:![0-9]+]] ; CHECK-NEXT: [[V11:%.*]] = shufflevector <128 x i8> [[V10]], <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[V14:%.*]] = shufflevector <32 x i8> [[V11]], <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[V16:%.*]] = shufflevector <128 x i8> [[V14]], <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, 
i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[V17:%.*]] = getelementptr inbounds ptr, ptr [[A1]], i32 [[A6:%.*]] -; CHECK-NEXT: [[V18:%.*]] = load ptr, ptr [[V17]], align 4, [[TBAA3]] +; CHECK-NEXT: [[V18:%.*]] = load ptr, ptr [[V17]], align 4, !tbaa [[TBAA3]] ; CHECK-NEXT: [[V19:%.*]] = getelementptr i8, ptr [[V18]], i32 [[A7:%.*]] -; CHECK-NEXT: [[V21:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[V19]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), [[TBAA5]] +; CHECK-NEXT: [[V21:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 32 [[V19]], <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa [[TBAA5]] ; CHECK-NEXT: [[V22:%.*]] = shufflevector <128 x i8> [[V21]], <128 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[V23:%.*]] = icmp ugt <32 x i8> [[V16]], 
[[V22]] ; CHECK-NEXT: [[V24:%.*]] = select <32 x i1> [[V23]], <32 x i8> [[V16]], <32 x i8> [[V22]] ; CHECK-NEXT: [[V25:%.*]] = shufflevector <32 x i8> [[V24]], <32 x i8> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: tail call void @llvm.masked.store.v128i8.p0(<128 x i8> [[V25]], ptr [[V2]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), [[TBAA8:!tbaa !.*]] +; CHECK-NEXT: tail call void @llvm.masked.store.v128i8.p0(<128 x i8> [[V25]], ptr align 32 [[V2]], <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), !tbaa [[TBAA8:![0-9]+]] ; CHECK-NEXT: ret i32 0 ; b0: diff --git a/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll index 7169d9f..54c51eb 100644 --- a/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll +++ b/llvm/test/Transforms/DeadStoreElimination/masked-dead-store.ll @@ -11,19 +11,19 @@ define dllexport i32 @f0(ptr %a0, ptr %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i ; CHECK-NEXT: [[V6:%.*]] = getelementptr inbounds ptr, ptr [[A1:%.*]], i32 [[A4:%.*]] ; CHECK-NEXT: [[V7:%.*]] = load ptr, ptr [[V6]], align 4, !tbaa [[TBAA3:![0-9]+]] ; CHECK-NEXT: [[V8:%.*]] = getelementptr i8, ptr [[V7]], i32 [[A5:%.*]] -; CHECK-NEXT: [[V10:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[V8]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[V10:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 32 [[V8]], <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, 
i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa [[TBAA5:![0-9]+]] ; CHECK-NEXT: [[V11:%.*]] = shufflevector <128 x i8> [[V10]], <128 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[V14:%.*]] = shufflevector <32 x i8> [[V11]], <32 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[V16:%.*]] = shufflevector <128 x i8> [[V14]], <128 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[V17:%.*]] = getelementptr inbounds ptr, ptr [[A1]], i32 [[A6:%.*]] ; CHECK-NEXT: [[V18:%.*]] = load ptr, ptr [[V17]], align 4, !tbaa [[TBAA3]] ; CHECK-NEXT: [[V19:%.*]] = getelementptr i8, ptr [[V18]], i32 [[A7:%.*]] -; CHECK-NEXT: [[V21:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[V19]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 
false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa [[TBAA5]] +; CHECK-NEXT: [[V21:%.*]] = tail call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 32 [[V19]], <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <128 x i8> undef), !tbaa [[TBAA5]] ; CHECK-NEXT: [[V22:%.*]] = shufflevector <128 x i8> [[V21]], <128 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[V23:%.*]] = icmp ugt <32 x i8> [[V16]], [[V22]] ; CHECK-NEXT: [[V24:%.*]] = select <32 x i1> [[V23]], <32 x i8> [[V16]], <32 x i8> [[V22]] ; CHECK-NEXT: [[V25:%.*]] = shufflevector <32 x i8> [[V24]], <32 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: tail call void @llvm.masked.store.v128i8.p0(<128 x i8> [[V25]], ptr [[V2]], i32 32, <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.masked.store.v128i8.p0(<128 x i8> [[V25]], ptr align 32 [[V2]], <128 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>), !tbaa 
[[TBAA8:![0-9]+]] ; CHECK-NEXT: ret i32 0 ; b0: @@ -53,8 +53,8 @@ b0: define dllexport i32 @f1(ptr %a, <4 x i8> %v1, <4 x i32> %v2) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V2:%.*]], ptr [[A:%.*]], i32 1, <4 x i1> splat (i1 true)) -; CHECK-NEXT: tail call void @llvm.masked.store.v4i8.p0(<4 x i8> [[V1:%.*]], ptr [[A]], i32 1, <4 x i1> splat (i1 true)) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V2:%.*]], ptr align 1 [[A:%.*]], <4 x i1> splat (i1 true)) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i8.p0(<4 x i8> [[V1:%.*]], ptr align 1 [[A]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: ret i32 0 ; tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %v2, ptr %a, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) @@ -64,8 +64,8 @@ define dllexport i32 @f1(ptr %a, <4 x i8> %v1, <4 x i32> %v2) { define dllexport i32 @f2(ptr %a, <4 x i8> %v1, <4 x i32> %v2, <4 x i1> %mask) { ; CHECK-LABEL: @f2( -; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V2:%.*]], ptr [[A:%.*]], i32 1, <4 x i1> [[MASK:%.*]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v4i8.p0(<4 x i8> [[V1:%.*]], ptr [[A]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V2:%.*]], ptr align 1 [[A:%.*]], <4 x i1> [[MASK:%.*]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v4i8.p0(<4 x i8> [[V1:%.*]], ptr align 1 [[A]], <4 x i1> [[MASK]]) ; CHECK-NEXT: ret i32 0 ; tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %v2, ptr %a, i32 1, <4 x i1> %mask) diff --git a/llvm/test/Transforms/EarlyCSE/masked-intrinsics-unequal-masks.ll b/llvm/test/Transforms/EarlyCSE/masked-intrinsics-unequal-masks.ll index 9739772..94cbc24 100644 --- a/llvm/test/Transforms/EarlyCSE/masked-intrinsics-unequal-masks.ll +++ b/llvm/test/Transforms/EarlyCSE/masked-intrinsics-unequal-masks.ll @@ -12,7 +12,7 @@ ; Expect the second load to be removed. define <4 x i32> @f3(ptr %a0, <4 x i32> %a1) { ; CHECK-LABEL: @f3( -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[V2:%.*]] = add <4 x i32> [[V0]], [[V0]] ; CHECK-NEXT: ret <4 x i32> [[V2]] ; @@ -26,8 +26,8 @@ define <4 x i32> @f3(ptr %a0, <4 x i32> %a1) { ; Expect the second load to remain. define <4 x i32> @f4(ptr %a0, <4 x i32> %a1) { ; CHECK-LABEL: @f4( -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer) ; CHECK-NEXT: [[V2:%.*]] = add <4 x i32> [[V0]], [[V1]] ; CHECK-NEXT: ret <4 x i32> [[V2]] ; @@ -41,8 +41,8 @@ define <4 x i32> @f4(ptr %a0, <4 x i32> %a1) { ; Expect the second load to remain. 
define <4 x i32> @f5(ptr %a0, <4 x i32> %a1) { ; CHECK-LABEL: @f5( -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer) ; CHECK-NEXT: [[V2:%.*]] = add <4 x i32> [[V0]], [[V1]] ; CHECK-NEXT: ret <4 x i32> [[V2]] ; @@ -59,7 +59,7 @@ define <4 x i32> @f5(ptr %a0, <4 x i32> %a1) { ; Expect the first store to be removed. define void @f6(<4 x i32> %a0, ptr %a1) { ; CHECK-LABEL: @f6( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr align 4 [[A1:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>) ; CHECK-NEXT: ret void ; call void @llvm.masked.store.v4i32.p0(<4 x i32> %a0, ptr %a1, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>) @@ -71,8 +71,8 @@ define void @f6(<4 x i32> %a0, ptr %a1) { ; Expect both stores to remain. define void @f7(<4 x i32> %a0, ptr %a1) { ; CHECK-LABEL: @f7( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0]], ptr [[A1]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr align 4 [[A1:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0]], ptr align 4 [[A1]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>) ; CHECK-NEXT: ret void ; call void @llvm.masked.store.v4i32.p0(<4 x i32> %a0, ptr %a1, i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) @@ -87,7 +87,7 @@ define void @f7(<4 x i32> %a0, ptr %a1) { ; Expect the store to be removed. define <4 x i32> @f8(ptr %a0, <4 x i32> %a1) { ; CHECK-LABEL: @f8( -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) ; CHECK-NEXT: ret <4 x i32> [[V0]] ; %v0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a0, i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %a1) @@ -99,8 +99,8 @@ define <4 x i32> @f8(ptr %a0, <4 x i32> %a1) { ; Expect the store to remain. 
define <4 x i32> @f9(ptr %a0, <4 x i32> %a1) { ; CHECK-LABEL: @f9( -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A0:%.*]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V0]], ptr [[A0]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A0:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V0]], ptr align 4 [[A0]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>) ; CHECK-NEXT: ret <4 x i32> [[V0]] ; %v0 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %a0, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> %a1) @@ -115,7 +115,7 @@ define <4 x i32> @f9(ptr %a0, <4 x i32> %a1) { ; Expect the load to be removed. define <4 x i32> @fa(<4 x i32> %a0, ptr %a1) { ; CHECK-LABEL: @fa( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr align 4 [[A1:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>) ; CHECK-NEXT: ret <4 x i32> [[A0]] ; call void @llvm.masked.store.v4i32.p0(<4 x i32> %a0, ptr %a1, i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) @@ -127,8 +127,8 @@ define <4 x i32> @fa(<4 x i32> %a0, ptr %a1) { ; Expect the load to remain. define <4 x i32> @fb(<4 x i32> %a0, ptr %a1) { ; CHECK-LABEL: @fb( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A1]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr align 4 [[A1:%.*]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A1]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer) ; CHECK-NEXT: ret <4 x i32> [[V0]] ; call void @llvm.masked.store.v4i32.p0(<4 x i32> %a0, ptr %a1, i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>) @@ -140,8 +140,8 @@ define <4 x i32> @fb(<4 x i32> %a0, ptr %a1) { ; Expect the load to remain. 
define <4 x i32> @fc(<4 x i32> %a0, ptr %a1) { ; CHECK-LABEL: @fc( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr [[A1:%.*]], i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>) -; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[A1]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> undef) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[A0:%.*]], ptr align 4 [[A1:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>) +; CHECK-NEXT: [[V0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[A1]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> undef) ; CHECK-NEXT: ret <4 x i32> [[V0]] ; call void @llvm.masked.store.v4i32.p0(<4 x i32> %a0, ptr %a1, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>) diff --git a/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll b/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll index 0fcb4fd..ae729d5 100644 --- a/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll +++ b/llvm/test/Transforms/EarlyCSE/masked-intrinsics.ll @@ -4,7 +4,7 @@ define <128 x i8> @f0(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { ; CHECK-LABEL: @f0( ; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[A1]], ptr [[A0:%.*]], i32 4, <128 x i1> [[V0]]) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[A1]], ptr align 4 [[A0:%.*]], <128 x i1> [[V0]]) ; CHECK-NEXT: ret <128 x i8> [[A1]] ; %v0 = icmp eq <128 x i8> %a1, %a2 @@ -16,7 +16,7 @@ define <128 x i8> @f0(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[A0:%.*]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 4 [[A0:%.*]], <128 x i1> [[V0]], <128 x i8> undef) ; CHECK-NEXT: ret <128 x i8> [[V1]] ; %v0 = icmp eq <128 x i8> %a1, %a2 @@ -28,7 +28,7 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { define <128 x i8> @f2(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { ; CHECK-LABEL: @f2( ; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[A0:%.*]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 4 [[A0:%.*]], <128 x i1> [[V0]], <128 x i8> undef) ; CHECK-NEXT: [[V3:%.*]] = add <128 x i8> [[V1]], [[V1]] ; CHECK-NEXT: ret <128 x i8> [[V3]] ; diff --git a/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll b/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll index da507f1..d46159c 100644 --- a/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll +++ b/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll @@ -18,8 +18,8 @@ define i32 @different_types_load(ptr %p) { define i32 @different_types_vector_load(ptr %p) { ; CHECK-LABEL: @different_types_vector_load( -; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[P:%.*]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison) -; CHECK-NEXT: [[V2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[P]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> poison) +; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 
[[P:%.*]], <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison) +; CHECK-NEXT: [[V2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[P]], <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> poison) ; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[V1]], i32 0 ; CHECK-NEXT: [[E2:%.*]] = extractelement <8 x i32> [[V2]], i32 6 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[E1]], [[E2]] @@ -50,8 +50,8 @@ define i32 @different_types_store(ptr %p, i32 %a) { define i32 @different_elt_types_vector_load(ptr %p, <4 x i1> %c) { ; CHECK-LABEL: @different_elt_types_vector_load( -; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[P:%.*]], i32 4, <4 x i1> [[C:%.*]], <4 x i32> poison) -; CHECK-NEXT: [[V2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> [[C]], <4 x float> poison) +; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[P:%.*]], <4 x i1> [[C:%.*]], <4 x i32> poison) +; CHECK-NEXT: [[V2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> [[C]], <4 x float> poison) ; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[V1]], i32 0 ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x float> [[V2]], i32 0 ; CHECK-NEXT: [[E2I:%.*]] = fptosi float [[E2]] to i32 @@ -69,8 +69,8 @@ define i32 @different_elt_types_vector_load(ptr %p, <4 x i1> %c) { define float @different_elt_types_vector_store_load(ptr %p, <4 x i32> %v1, <4 x i1> %c) { ; CHECK-LABEL: @different_elt_types_vector_store_load( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V1:%.*]], ptr [[P:%.*]], i32 4, <4 x i1> [[C:%.*]]) -; CHECK-NEXT: [[V2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> [[C]], <4 x float> poison) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V1:%.*]], ptr align 4 [[P:%.*]], <4 x i1> [[C:%.*]]) +; CHECK-NEXT: [[V2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[P]], <4 x i1> [[C]], <4 x float> poison) ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x float> [[V2]], i32 0 ; CHECK-NEXT: ret float [[E2]] ; diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index d0aec18..87f64ed 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -253,20 +253,20 @@ declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>%val, <4 x ptr>, i32, <4 x define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; FNATTRS-LABEL: define {{[^@]+}}@test9 -; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR7:[0-9]+]] { -; FNATTRS-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) +; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR6:[0-9]+]] { +; FNATTRS-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>) ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test9 ; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 4, <4 x i1> 
<i1 true, i1 false, i1 true, i1 false>) #[[ATTR15:[0-9]+]] +; ATTRIBUTOR-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>) #[[ATTR15:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; ; ATTRIBUTOR-CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@test9 ; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-CGSCC-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) #[[ATTR15:[0-9]+]] +; ATTRIBUTOR-CGSCC-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>) #[[ATTR15:[0-9]+]] ; ATTRIBUTOR-CGSCC-NEXT: ret void ; call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>%val, <4 x ptr> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>) @@ -277,20 +277,20 @@ declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x define <4 x i32> @test10(<4 x ptr> %ptrs) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; FNATTRS-LABEL: define {{[^@]+}}@test10 -; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { -; FNATTRS-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) +; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR7:[0-9]+]] { +; FNATTRS-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) ; FNATTRS-NEXT: ret <4 x i32> [[RES]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test10 -; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR7:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR16:[0-9]+]] +; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR5:[0-9]+]] { +; ATTRIBUTOR-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR16:[0-9]+]] ; ATTRIBUTOR-NEXT: ret <4 x i32> [[RES]] ; ; ATTRIBUTOR-CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@test10 -; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR8:[0-9]+]] { -; ATTRIBUTOR-CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR16:[0-9]+]] +; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR6:[0-9]+]] { +; ATTRIBUTOR-CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) #[[ATTR16:[0-9]+]] ; ATTRIBUTOR-CGSCC-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>, <4 x i32>undef) @@ -301,19 +301,19 @@ declare <4 x i32> @test11_1(<4 x ptr>) argmemonly nounwind readonly define <4 x i32> @test11_2(<4 x ptr> %ptrs) { ; 
FNATTRS: Function Attrs: nofree nounwind memory(argmem: read) ; FNATTRS-LABEL: define {{[^@]+}}@test11_2 -; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR11:[0-9]+]] { +; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { ; FNATTRS-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) ; FNATTRS-NEXT: ret <4 x i32> [[RES]] ; ; ATTRIBUTOR: Function Attrs: nosync nounwind memory(argmem: read) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test11_2 -; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { +; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR7:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) #[[ATTR3]] ; ATTRIBUTOR-NEXT: ret <4 x i32> [[RES]] ; ; ATTRIBUTOR-CGSCC: Function Attrs: nosync nounwind memory(argmem: read) ; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@test11_2 -; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR10:[0-9]+]] { +; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR-CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @test11_1(<4 x ptr> [[PTRS]]) #[[ATTR3]] ; ATTRIBUTOR-CGSCC-NEXT: ret <4 x i32> [[RES]] ; @@ -325,19 +325,19 @@ declare <4 x i32> @test12_1(<4 x ptr>) argmemonly nounwind define <4 x i32> @test12_2(<4 x ptr> %ptrs) { ; FNATTRS: Function Attrs: nounwind memory(argmem: readwrite) ; FNATTRS-LABEL: define {{[^@]+}}@test12_2 -; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR12:[0-9]+]] { +; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR10:[0-9]+]] { ; FNATTRS-NEXT: [[RES:%.*]] = call <4 x i32> @test12_1(<4 x ptr> [[PTRS]]) ; FNATTRS-NEXT: ret <4 x i32> [[RES]] ; ; ATTRIBUTOR: Function Attrs: nounwind memory(argmem: readwrite) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test12_2 -; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR10:[0-9]+]] { +; ATTRIBUTOR-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR8:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[RES:%.*]] = call <4 x i32> @test12_1(<4 x ptr> [[PTRS]]) ; ATTRIBUTOR-NEXT: ret <4 x i32> [[RES]] ; ; ATTRIBUTOR-CGSCC: Function Attrs: nounwind memory(argmem: readwrite) ; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@test12_2 -; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR11:[0-9]+]] { +; ATTRIBUTOR-CGSCC-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR-CGSCC-NEXT: [[RES:%.*]] = call <4 x i32> @test12_1(<4 x ptr> [[PTRS]]) ; ATTRIBUTOR-CGSCC-NEXT: ret <4 x i32> [[RES]] ; @@ -348,19 +348,19 @@ define <4 x i32> @test12_2(<4 x ptr> %ptrs) { define i32 @volatile_load(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite, inaccessiblemem: readwrite) ; FNATTRS-LABEL: define {{[^@]+}}@volatile_load -; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR13:[0-9]+]] { +; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR11:[0-9]+]] { ; FNATTRS-NEXT: [[LOAD:%.*]] = load volatile i32, ptr [[P]], align 4 ; FNATTRS-NEXT: ret i32 [[LOAD]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@volatile_load -; ATTRIBUTOR-SAME: (ptr nofree [[P:%.*]]) #[[ATTR11:[0-9]+]] { +; ATTRIBUTOR-SAME: (ptr nofree [[P:%.*]]) #[[ATTR9:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[LOAD:%.*]] = load volatile i32, ptr [[P]], align 4 ; ATTRIBUTOR-NEXT: ret i32 [[LOAD]] ; ; ATTRIBUTOR-CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) ; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@volatile_load -; ATTRIBUTOR-CGSCC-SAME: (ptr nofree [[P:%.*]]) #[[ATTR12:[0-9]+]] { +; ATTRIBUTOR-CGSCC-SAME: (ptr nofree [[P:%.*]]) 
#[[ATTR10:[0-9]+]] { ; ATTRIBUTOR-CGSCC-NEXT: [[LOAD:%.*]] = load volatile i32, ptr [[P]], align 4 ; ATTRIBUTOR-CGSCC-NEXT: ret i32 [[LOAD]] ; @@ -570,7 +570,7 @@ define void @fptr_test2c(ptr %p, ptr %f) { define void @alloca_recphi() { ; FNATTRS: Function Attrs: nofree norecurse nosync nounwind memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@alloca_recphi -; FNATTRS-SAME: () #[[ATTR14:[0-9]+]] { +; FNATTRS-SAME: () #[[ATTR12:[0-9]+]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: [[A:%.*]] = alloca [8 x i32], align 4 ; FNATTRS-NEXT: [[A_END:%.*]] = getelementptr i32, ptr [[A]], i64 8 @@ -587,7 +587,7 @@ define void @alloca_recphi() { ; ; ATTRIBUTOR: Function Attrs: nofree norecurse nosync nounwind memory(none) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@alloca_recphi -; ATTRIBUTOR-SAME: () #[[ATTR12:[0-9]+]] { +; ATTRIBUTOR-SAME: () #[[ATTR10:[0-9]+]] { ; ATTRIBUTOR-NEXT: entry: ; ATTRIBUTOR-NEXT: [[A:%.*]] = alloca [8 x i32], align 4 ; ATTRIBUTOR-NEXT: [[A_END:%.*]] = getelementptr i32, ptr [[A]], i64 8 @@ -723,19 +723,19 @@ define void @op_bundle_readonly_unknown(ptr %p) { define i32 @writable_readonly(ptr writable dereferenceable(4) %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) ; FNATTRS-LABEL: define {{[^@]+}}@writable_readonly -; FNATTRS-SAME: (ptr readonly captures(none) dereferenceable(4) [[P:%.*]]) #[[ATTR15:[0-9]+]] { +; FNATTRS-SAME: (ptr readonly captures(none) dereferenceable(4) [[P:%.*]]) #[[ATTR13:[0-9]+]] { ; FNATTRS-NEXT: [[V:%.*]] = load i32, ptr [[P]], align 4 ; FNATTRS-NEXT: ret i32 [[V]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@writable_readonly -; ATTRIBUTOR-SAME: (ptr nofree nonnull readonly captures(none) dereferenceable(4) [[P:%.*]]) #[[ATTR13:[0-9]+]] { +; ATTRIBUTOR-SAME: (ptr nofree nonnull readonly captures(none) dereferenceable(4) [[P:%.*]]) #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[V:%.*]] = load i32, ptr [[P]], align 4 ; ATTRIBUTOR-NEXT: ret i32 [[V]] ; ; ATTRIBUTOR-CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) ; ATTRIBUTOR-CGSCC-LABEL: define {{[^@]+}}@writable_readonly -; ATTRIBUTOR-CGSCC-SAME: (ptr nofree nonnull readonly captures(none) dereferenceable(4) [[P:%.*]]) #[[ATTR13:[0-9]+]] { +; ATTRIBUTOR-CGSCC-SAME: (ptr nofree nonnull readonly captures(none) dereferenceable(4) [[P:%.*]]) #[[ATTR11:[0-9]+]] { ; ATTRIBUTOR-CGSCC-NEXT: [[V:%.*]] = load i32, ptr [[P]], align 4 ; ATTRIBUTOR-CGSCC-NEXT: ret i32 [[V]] ; diff --git a/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll b/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll index 2ccb1ef..bc0c809 100644 --- a/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll +++ b/llvm/test/Transforms/FunctionAttrs/vector-of-pointers-getunderlyingobject-crash.ll @@ -4,7 +4,7 @@ define double @getUnderlyingObject_vector_ptr(<4 x i1> %arg0, <4 x i1> %arg1) { ; CHECK-LABEL: define double @getUnderlyingObject_vector_ptr( ; CHECK-SAME: <4 x i1> [[ARG0:%.*]], <4 x i1> [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> getelementptr inbounds (i8, <4 x ptr> zeroinitializer, <4 x i64> splat (i64 8)), i32 0, <4 x i1> [[ARG0]], <4 x double> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> 
align 8 getelementptr inbounds (i8, <4 x ptr> zeroinitializer, <4 x i64> splat (i64 8)), <4 x i1> [[ARG0]], <4 x double> zeroinitializer) ; CHECK-NEXT: [[REDUCE_FADD:%.*]] = tail call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[GATHER]]) ; CHECK-NEXT: ret double [[REDUCE_FADD]] ; diff --git a/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather-inseltpoison.ll b/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather-inseltpoison.ll index aeb3de9..f8703a8 100644 --- a/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather-inseltpoison.ll +++ b/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather-inseltpoison.ll @@ -16,12 +16,12 @@ define spir_kernel void @test(<2 x ptr> %in1, <2 x ptr> %in2, ptr %out) { ; CHECK-NEXT: [[TMP_1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP_I:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP_0]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = insertelement <2 x ptr> [[TMP_I]], ptr [[TMP_1]], i32 1 -; CHECK-NEXT: [[IN1_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN1]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) -; CHECK-NEXT: [[IN2_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN2]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) -; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN1_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP_V_0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) -; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN2_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP_V_1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: [[IN1_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[IN1]], <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: [[IN2_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[IN2]], <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN1_V]], <2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP_V_0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN2_V]], <2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP_V_1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true), <2 x i32> undef) ; CHECK-NEXT: [[TMP_V_1_0:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 0 ; CHECK-NEXT: [[TMP_V_1_1:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 1 ; CHECK-NEXT: store i32 [[TMP_V_1_0]], ptr [[OUT]], align 4 diff --git a/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll b/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll index 4c00060..4d82cd0 100644 --- a/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll +++ b/llvm/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll @@ -16,12 +16,12 @@ define spir_kernel void @test(<2 x ptr> %in1, <2 x ptr> %in2, ptr %out) { ; CHECK-NEXT: [[TMP_1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[TMP_I:%.*]] = insertelement <2 x ptr> undef, ptr [[TMP_0]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = insertelement <2 x ptr> [[TMP_I]], ptr [[TMP_1]], i32 1 -; CHECK-NEXT: [[IN1_V:%.*]] = call <2 x i32> 
@llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN1]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) -; CHECK-NEXT: [[IN2_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[IN2]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) -; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN1_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP_V_0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) -; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN2_V]], <2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP_V_1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP]], i32 1, <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: [[IN1_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[IN1]], <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: [[IN2_V:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[IN2]], <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN1_V]], <2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP_V_0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true), <2 x i32> undef) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> [[IN2_V]], <2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP_V_1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 1 [[TMP]], <2 x i1> splat (i1 true), <2 x i32> undef) ; CHECK-NEXT: [[TMP_V_1_0:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 0 ; CHECK-NEXT: [[TMP_V_1_1:%.*]] = extractelement <2 x i32> [[TMP_V_1]], i32 1 ; CHECK-NEXT: store i32 [[TMP_V_1_0]], ptr [[OUT]], align 4 diff --git a/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll index 512ea37..c61a394 100644 --- a/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll +++ b/llvm/test/Transforms/GVN/masked-load-store-no-mem-dep.ll @@ -5,22 +5,22 @@ define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) { ; CHECK-LABEL: @forward_binop_with_sel( ; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]]) -; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP0:%.*]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_0_16]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <4 x i1> [[MASK]]) ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]] ; CHECK-NEXT: ret <4 
x float> [[TMP3]] ; ; MEMDEPFALSE-LABEL: @forward_binop_with_sel( ; MEMDEPFALSE-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]]) -; MEMDEPFALSE-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; MEMDEPFALSE-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP0:%.*]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; MEMDEPFALSE-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; MEMDEPFALSE-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; MEMDEPFALSE-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_0_16]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; MEMDEPFALSE-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; MEMDEPFALSE-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]]) -; MEMDEPFALSE-NEXT: [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 1, <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]]) +; MEMDEPFALSE-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <4 x i1> [[MASK]]) +; MEMDEPFALSE-NEXT: [[LOAD_1_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP1]], <4 x i1> [[MASK]], <4 x float> [[PASSTHROUGH:%.*]]) ; MEMDEPFALSE-NEXT: ret <4 x float> [[LOAD_1_0]] ; %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll b/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll index 466f787..2017a02 100644 --- a/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll +++ b/llvm/test/Transforms/GVN/masked-load-store-vn-crash.ll @@ -5,7 +5,7 @@ define fastcc void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WIDE_MASKED_LOAD_1_I:%.*]] = tail call <4 x i64> @llvm.masked.load.v4i64.p0(ptr nonnull getelementptr inbounds ([8 x i64], ptr @file_mask, i64 0, i64 7), i32 8, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD_1_I:%.*]] = tail call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 getelementptr inbounds ([8 x i64], ptr @file_mask, i64 0, i64 7), <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> undef) ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/GVN/masked-load-store.ll b/llvm/test/Transforms/GVN/masked-load-store.ll index b112e99..ee9dbd7 100644 --- a/llvm/test/Transforms/GVN/masked-load-store.ll +++ b/llvm/test/Transforms/GVN/masked-load-store.ll @@ -7,7 +7,7 @@ define <128 x i8> @f0(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { ; CHECK-LABEL: @f0( ; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[A0:%.*]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: [[V1:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 4 [[A0:%.*]], <128 x i1> [[V0]], <128 x i8> undef) ; CHECK-NEXT: [[V3:%.*]] = add <128 x i8> [[V1]], [[V1]] ; CHECK-NEXT: ret <128 x i8> [[V3]] ; @@ -22,8 +22,8 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: [[V0:%.*]] = icmp eq <128 x i8> [[A1:%.*]], [[A2:%.*]] ; CHECK-NEXT: [[V1:%.*]] = getelementptr <128 x i8>, ptr 
[[A0:%.*]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr [[A0]], i32 4, <128 x i1> [[V0]], <128 x i8> undef) -; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[A2]], ptr [[V1]], i32 4, <128 x i1> [[V0]]) +; CHECK-NEXT: [[V2:%.*]] = call <128 x i8> @llvm.masked.load.v128i8.p0(ptr align 4 [[A0]], <128 x i1> [[V0]], <128 x i8> undef) +; CHECK-NEXT: call void @llvm.masked.store.v128i8.p0(<128 x i8> [[A2]], ptr align 4 [[V1]], <128 x i1> [[V0]]) ; CHECK-NEXT: [[V4:%.*]] = add <128 x i8> [[V2]], [[V2]] ; CHECK-NEXT: ret <128 x i8> [[V4]] ; @@ -38,9 +38,9 @@ define <128 x i8> @f1(ptr %a0, <128 x i8> %a1, <128 x i8> %a2) { define <4 x float> @forward_masked_load(ptr %0, ptr %1) { ; CHECK-LABEL: @forward_masked_load( -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true)) -; CHECK-NEXT: ret <4 x float> [[TMP4]] +; CHECK-NEXT: [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP0:%.*]], <4 x i1> splat (i1 true), <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr align 1 [[TMP1:%.*]], <4 x i1> splat (i1 true)) +; CHECK-NEXT: ret <4 x float> [[LOAD1]] ; %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4) %load1 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer) @@ -51,8 +51,8 @@ define <4 x float> @forward_masked_load(ptr %0, ptr %1) { define <4 x float> @forward_masked_load_arbitrary_mask(ptr %loc_a, ptr %loc_b, <4 x i1> %mask) { ; CHECK-LABEL: @forward_masked_load_arbitrary_mask( -; CHECK-NEXT: [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[LOC_A:%.*]], i32 1, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr [[LOC_B:%.*]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[LOC_A:%.*]], <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[LOAD1]], ptr align 1 [[LOC_B:%.*]], <4 x i1> [[MASK]]) ; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[LOAD1]], <4 x float> zeroinitializer ; CHECK-NEXT: ret <4 x float> [[TMP1]] ; @@ -64,11 +64,11 @@ define <4 x float> @forward_masked_load_arbitrary_mask(ptr %loc_a, ptr %loc_b, < define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) { ; CHECK-LABEL: @forward_binop_splat_i1_mask( -; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP0:%.*]], <4 x i1> splat (i1 true), <4 x float> zeroinitializer) ; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> splat (i1 true), <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_0_16]], <4 x i1> splat (i1 true), <4 x float> zeroinitializer) ; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; CHECK-NEXT: call void 
@llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: ret <4 x float> [[FMUL]] ; %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 4) @@ -84,11 +84,11 @@ define <4 x float> @forward_binop_splat_i1_mask(ptr %0, ptr %1) { define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x float> %passthrough) { ; CHECK-LABEL: @forward_binop_with_sel( ; CHECK-NEXT: [[MASK:%.*]] = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[A:%.*]], i32 [[B:%.*]]) -; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0:%.*]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[TMP0:%.*]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[GEP_0_16]], <4 x i1> [[MASK]], <4 x float> zeroinitializer) ; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <4 x i1> [[MASK]]) ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]] ; CHECK-NEXT: ret <4 x float> [[TMP3]] ; @@ -105,9 +105,9 @@ define <4 x float> @forward_binop_with_sel(ptr %0, ptr %1, i32 %a, i32 %b, <4 x define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { ; CHECK-LABEL: @forward_masked_load_scalable( ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH]] +; CHECK-NEXT: [[LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0:%.*]], <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD1]], ptr align 1 [[TMP1:%.*]], <vscale x 4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[LOAD1]], <vscale x 4 x float> [[PASSTHROUGH]] ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) @@ -120,9 +120,9 @@ define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscal define <vscale x 4 x float> @forward_masked_load_scalable_type_mismatch(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { ; CHECK-LABEL: @forward_masked_load_scalable_type_mismatch( ; CHECK-NEXT: [[MASK:%.*]] = call 
<vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) -; CHECK-NEXT: [[LOAD1:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x double> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[LOAD1]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]]) -; CHECK-NEXT: [[LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[LOAD1:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr align 1 [[TMP0:%.*]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x double> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[LOAD1]], ptr align 1 [[TMP1:%.*]], <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: [[LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD2]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) @@ -135,9 +135,9 @@ define <vscale x 4 x float> @forward_masked_load_scalable_type_mismatch(ptr %0, define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { ; CHECK-LABEL: @generate_sel_with_passthrough( ; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) -; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[PASSTHROUGH:%.*]] +; CHECK-NEXT: [[LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0:%.*]], <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD1]], ptr align 1 [[TMP1:%.*]], <vscale x 4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = select <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[LOAD1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]] ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]] ; %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) @@ -150,11 +150,11 @@ define <vscale x 4 x float> @generate_sel_with_passthrough(ptr %0, ptr %1, <vsca define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) { ; CHECK-LABEL: @forward_binop_with_sel_scalable( ; CHECK-NEXT: [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) -; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0:%.*]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> 
[[MASK]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_16]], <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <vscale x 4 x i1> [[MASK]]) ; CHECK-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]] ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP3]] ; @@ -172,12 +172,12 @@ define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x flo ; CHECK-LABEL: @load_mask_differs( ; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) ; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) -; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0:%.*]], <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_16]], <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]]) -; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <vscale x 4 x i1> [[MASK0]]) +; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]] ; %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) @@ -195,12 +195,12 @@ define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x fl ; CHECK-LABEL: @store_mask_differs( ; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) ; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4) -; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP0:%.*]], <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: 
[[GEP_0_16:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 -; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[GEP_0_16]], <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]]) -; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr align 1 [[TMP1:%.*]], <vscale x 4 x i1> [[MASK1]]) +; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[TMP1]], <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]] ; %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8) diff --git a/llvm/test/Transforms/GVNSink/ptrtoaddr.ll b/llvm/test/Transforms/GVNSink/ptrtoaddr.ll new file mode 100644 index 0000000..3a9d16e --- /dev/null +++ b/llvm/test/Transforms/GVNSink/ptrtoaddr.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=gvn-sink < %s | FileCheck %s + +define i64 @test(i1 %c, ptr %p, ptr %p2) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: br label %[[JOIN:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[P2_SINK:%.*]] = phi ptr [ [[P2]], %[[ELSE]] ], [ [[P]], %[[IF]] ] +; CHECK-NEXT: [[PHI:%.*]] = ptrtoaddr ptr [[P2_SINK]] to i64 +; CHECK-NEXT: ret i64 [[PHI]] +; + br i1 %c, label %if, label %else + +if: + %p.addr = ptrtoaddr ptr %p to i64 + br label %join + +else: + %p2.addr = ptrtoaddr ptr %p2 to i64 + br label %join + +join: + %phi = phi i64 [ %p.addr, %if ], [ %p2.addr, %else ] + ret i64 %phi +} diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/masked-load-store.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/masked-load-store.ll index e14dfd0..ed58f39 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/masked-load-store.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/masked-load-store.ll @@ -4,7 +4,7 @@ define <32 x i32> @masked_load_v32i32_global_to_flat(ptr addrspace(1) %ptr, <32 x i1> %mask) { ; CHECK-LABEL: define <32 x i32> @masked_load_v32i32_global_to_flat( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <32 x i1> [[MASK:%.*]]) { -; CHECK-NEXT: [[LOAD:%.*]] = call <32 x i32> @llvm.masked.load.v32i32.p1(ptr addrspace(1) [[PTR]], i32 8, <32 x i1> [[MASK]], <32 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <32 x i32> @llvm.masked.load.v32i32.p1(ptr addrspace(1) align 8 [[PTR]], <32 x i1> [[MASK]], <32 x i32> zeroinitializer) ; CHECK-NEXT: ret <32 x i32> [[LOAD]] ; %cast = addrspacecast ptr addrspace(1) %ptr to ptr @@ -14,7 +14,7 @@ define <32 x i32> @masked_load_v32i32_global_to_flat(ptr addrspace(1) %ptr, <32 define <32 x i32> @masked_load_v32i32_local_to_flat(ptr 
addrspace(3) %ptr, <32 x i1> %mask) { ; CHECK-LABEL: define <32 x i32> @masked_load_v32i32_local_to_flat( ; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]], <32 x i1> [[MASK:%.*]]) { -; CHECK-NEXT: [[LOAD:%.*]] = call <32 x i32> @llvm.masked.load.v32i32.p3(ptr addrspace(3) [[PTR]], i32 8, <32 x i1> [[MASK]], <32 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <32 x i32> @llvm.masked.load.v32i32.p3(ptr addrspace(3) align 8 [[PTR]], <32 x i1> [[MASK]], <32 x i32> zeroinitializer) ; CHECK-NEXT: ret <32 x i32> [[LOAD]] ; %cast = addrspacecast ptr addrspace(3) %ptr to ptr @@ -25,7 +25,7 @@ define <32 x i32> @masked_load_v32i32_local_to_flat(ptr addrspace(3) %ptr, <32 x define <32 x i32> @masked_load_v32i32_private_to_flat(ptr addrspace(5) %ptr, <32 x i1> %mask) { ; CHECK-LABEL: define <32 x i32> @masked_load_v32i32_private_to_flat( ; CHECK-SAME: ptr addrspace(5) [[PTR:%.*]], <32 x i1> [[MASK:%.*]]) { -; CHECK-NEXT: [[LOAD:%.*]] = call <32 x i32> @llvm.masked.load.v32i32.p5(ptr addrspace(5) [[PTR]], i32 8, <32 x i1> [[MASK]], <32 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <32 x i32> @llvm.masked.load.v32i32.p5(ptr addrspace(5) align 8 [[PTR]], <32 x i1> [[MASK]], <32 x i32> zeroinitializer) ; CHECK-NEXT: ret <32 x i32> [[LOAD]] ; %cast = addrspacecast ptr addrspace(5) %ptr to ptr @@ -36,7 +36,7 @@ define <32 x i32> @masked_load_v32i32_private_to_flat(ptr addrspace(5) %ptr, <32 define void @masked_store_v32i32_global_to_flat(ptr addrspace(1) %ptr, <32 x i1> %mask) { ; CHECK-LABEL: define void @masked_store_v32i32_global_to_flat( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <32 x i1> [[MASK:%.*]]) { -; CHECK-NEXT: tail call void @llvm.masked.store.v32i32.p1(<32 x i32> zeroinitializer, ptr addrspace(1) [[PTR]], i32 128, <32 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v32i32.p1(<32 x i32> zeroinitializer, ptr addrspace(1) align 128 [[PTR]], <32 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %ptr to ptr @@ -47,7 +47,7 @@ define void @masked_store_v32i32_global_to_flat(ptr addrspace(1) %ptr, <32 x i1 define void @masked_store_v32i32_local_to_flat(ptr addrspace(3) %ptr, <32 x i1> %mask) { ; CHECK-LABEL: define void @masked_store_v32i32_local_to_flat( ; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]], <32 x i1> [[MASK:%.*]]) { -; CHECK-NEXT: tail call void @llvm.masked.store.v32i32.p3(<32 x i32> zeroinitializer, ptr addrspace(3) [[PTR]], i32 128, <32 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v32i32.p3(<32 x i32> zeroinitializer, ptr addrspace(3) align 128 [[PTR]], <32 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(3) %ptr to ptr @@ -58,7 +58,7 @@ define void @masked_store_v32i32_local_to_flat(ptr addrspace(3) %ptr, <32 x i1> define void @masked_store_v32i32_private_to_flat(ptr addrspace(5) %ptr, <32 x i1> %mask) { ; CHECK-LABEL: define void @masked_store_v32i32_private_to_flat( ; CHECK-SAME: ptr addrspace(5) [[PTR:%.*]], <32 x i1> [[MASK:%.*]]) { -; CHECK-NEXT: tail call void @llvm.masked.store.v32i32.p5(<32 x i32> zeroinitializer, ptr addrspace(5) [[PTR]], i32 128, <32 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v32i32.p5(<32 x i32> zeroinitializer, ptr addrspace(5) align 128 [[PTR]], <32 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(5) %ptr to ptr diff --git a/llvm/test/Transforms/InferAddressSpaces/masked-gather-scatter.ll b/llvm/test/Transforms/InferAddressSpaces/masked-gather-scatter.ll index e6b27cc..46af2bf 100644 --- 
a/llvm/test/Transforms/InferAddressSpaces/masked-gather-scatter.ll +++ b/llvm/test/Transforms/InferAddressSpaces/masked-gather-scatter.ll @@ -6,7 +6,7 @@ define <4 x i32> @masked_gather_inferas(ptr addrspace(1) %out, <4 x i64> %index) ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <4 x i64> [[INDEX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], <4 x i64> [[INDEX]] -; CHECK-NEXT: [[VALUE:%.*]] = tail call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> [[PTRS]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[VALUE:%.*]] = tail call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> align 4 [[PTRS]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: ret <4 x i32> [[VALUE]] ; entry: @@ -21,7 +21,7 @@ define void @masked_scatter_inferas(ptr addrspace(1) %out, <4 x i64> %index, <4 ; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <4 x i64> [[INDEX:%.*]], <4 x i32> [[VALUE:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT]], <4 x i64> [[INDEX]] -; CHECK-NEXT: tail call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> [[VALUE]], <4 x ptr addrspace(1)> [[PTRS]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: tail call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> [[VALUE]], <4 x ptr addrspace(1)> align 4 [[PTRS]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/InferAlignment/masked.ll b/llvm/test/Transforms/InferAlignment/masked.ll index 13acf9b..12b5c2c 100644 --- a/llvm/test/Transforms/InferAlignment/masked.ll +++ b/llvm/test/Transforms/InferAlignment/masked.ll @@ -6,7 +6,7 @@ define <2 x i32> @load(<2 x i1> %mask, ptr %ptr) { ; CHECK-SAME: <2 x i1> [[MASK:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[PTR]], i64 64) ] -; CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[PTR]], i32 64, <2 x i1> [[MASK]], <2 x i32> poison) +; CHECK-NEXT: [[MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 64 [[PTR]], <2 x i1> [[MASK]], <2 x i32> poison) ; CHECK-NEXT: ret <2 x i32> [[MASKED_LOAD]] ; entry: @@ -20,7 +20,7 @@ define void @store(<2 x i1> %mask, <2 x i32> %val, ptr %ptr) { ; CHECK-SAME: <2 x i1> [[MASK:%.*]], <2 x i32> [[VAL:%.*]], ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[PTR]], i64 64) ] -; CHECK-NEXT: tail call void @llvm.masked.store.v2i32.p0(<2 x i32> [[VAL]], ptr [[PTR]], i32 64, <2 x i1> [[MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v2i32.p0(<2 x i32> [[VAL]], ptr align 64 [[PTR]], <2 x i1> [[MASK]]) ; CHECK-NEXT: ret void ; entry: @@ -33,11 +33,11 @@ define <2 x i32> @null(<2 x i1> %mask, <2 x i32> %val) { ; CHECK-LABEL: define <2 x i32> @null( ; CHECK-SAME: <2 x i1> [[MASK:%.*]], <2 x i32> [[VAL:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[MASKED_LOAD:%.*]] = tail call <2 x i32> @llvm.masked.load.v2i32.p0(ptr null, i32 1, <2 x i1> [[MASK]], <2 x i32> [[VAL]]) +; CHECK-NEXT: [[MASKED_LOAD:%.*]] = tail call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4294967296 null, <2 x i1> [[MASK]], <2 x i32> [[VAL]]) ; CHECK-NEXT: ret <2 x i32> [[MASKED_LOAD]] ; entry: - %masked_load = tail call <2 x i32> @llvm.masked.load.v2f64.p0(ptr null, i32 1, <2 x i1> %mask, <2 x i32> %val) + %masked_load = tail call <2 x i32> @llvm.masked.load.v2i32.p0(ptr null, i32 1, <2 x i1> %mask, <2 
x i32> %val) ret <2 x i32> %masked_load } diff --git a/llvm/test/Transforms/Inline/ML/recursive.ll b/llvm/test/Transforms/Inline/ML/recursive.ll new file mode 100644 index 0000000..2d9240a --- /dev/null +++ b/llvm/test/Transforms/Inline/ML/recursive.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; REQUIRES: llvm_inliner_model_autogenerated +; RUN: opt -S %s -o - -passes='inliner-ml-advisor-release' -ml-inliner-skip-policy=if-caller-not-cold | FileCheck %s +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-android29" + +define i32 @a_func(ptr %this, i32 %color_id, i1 %dark_mode) local_unnamed_addr { +; CHECK-LABEL: define i32 @a_func( +; CHECK-SAME: ptr [[THIS:%.*]], i32 [[COLOR_ID:%.*]], i1 [[DARK_MODE:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[DARK_MODE]], label %[[SW_BB97:.*]], label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret i32 0 +; CHECK: [[SW_BB97]]: +; CHECK-NEXT: br label %[[COMMON_RET]] +; +entry: + br i1 %dark_mode, label %sw.bb97, label %common.ret + +common.ret: ; preds = %sw.bb97, %entry + ret i32 0 + +sw.bb97: ; preds = %entry + %call.i = tail call i32 @a_func(ptr null, i32 0, i1 false) + br label %common.ret +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare ptr @llvm.load.relative.i32(ptr %0, i32 %1) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +;. diff --git a/llvm/test/Transforms/Inline/pr50589.ll b/llvm/test/Transforms/Inline/pr50589.ll index 7be163f..cae0f12 100644 --- a/llvm/test/Transforms/Inline/pr50589.ll +++ b/llvm/test/Transforms/Inline/pr50589.ll @@ -6,7 +6,7 @@ define <2 x i8> @callee1(ptr %ptr1, ptr noalias %ptr2, <2 x i1> %mask, <2 x i8> %passthru) { ; CHECK-LABEL: @callee1( -; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr [[PTR1:%.*]], i32 1, <2 x i1> [[MASK:%.*]], <2 x i8> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 [[PTR1:%.*]], <2 x i1> [[MASK:%.*]], <2 x i8> [[PASSTHRU:%.*]]) ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[PTR2:%.*]], align 2 ; CHECK-NEXT: ret <2 x i8> [[RET]] ; @@ -18,9 +18,9 @@ define <2 x i8> @callee1(ptr %ptr1, ptr noalias %ptr2, <2 x i1> %mask, <2 x i8> ; The load should not have !noalias. 
define void @caller1(ptr %ptr1, ptr %ptr2) { ; CHECK-LABEL: @caller1( -; CHECK-NEXT: [[PASSTHRU:%.*]] = load <2 x i8>, ptr [[PTR2:%.*]], align 2{{$}} +; CHECK-NEXT: [[PASSTHRU:%.*]] = load <2 x i8>, ptr [[PTR2:%.*]], align 2 ; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) -; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[PTR2]], align 2, !alias.scope !0 +; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[PTR2]], align 2, !alias.scope [[META0]] ; CHECK-NEXT: ret void ; %passthru = load <2 x i8>, ptr %ptr2 @@ -31,7 +31,7 @@ define void @caller1(ptr %ptr1, ptr %ptr2) { define <2 x i8> @callee2(ptr %ptr1, ptr noalias %ptr2, <2 x i1> %mask) { ; CHECK-LABEL: @callee2( ; CHECK-NEXT: [[PASSTHRU:%.*]] = load <2 x i8>, ptr [[PTR2:%.*]], align 2 -; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr [[PTR1:%.*]], i32 1, <2 x i1> [[MASK:%.*]], <2 x i8> [[PASSTHRU]]) +; CHECK-NEXT: [[RET:%.*]] = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr align 1 [[PTR1:%.*]], <2 x i1> [[MASK:%.*]], <2 x i8> [[PASSTHRU]]) ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[PTR2]], align 2 ; CHECK-NEXT: ret <2 x i8> [[RET]] ; @@ -45,8 +45,8 @@ define <2 x i8> @callee2(ptr %ptr1, ptr noalias %ptr2, <2 x i1> %mask) { define void @caller2(ptr %ptr1, ptr %ptr2) { ; CHECK-LABEL: @caller2( ; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) -; CHECK-NEXT: [[PASSTHRU_I:%.*]] = load <2 x i8>, ptr [[PTR2:%.*]], align 2, !alias.scope !3{{$}} -; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[PTR2]], align 2, !alias.scope !3 +; CHECK-NEXT: [[PASSTHRU_I:%.*]] = load <2 x i8>, ptr [[PTR2:%.*]], align 2, !alias.scope [[META3]] +; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[PTR2]], align 2, !alias.scope [[META3]] ; CHECK-NEXT: ret void ; call <2 x i8> @callee2(ptr %ptr1, ptr %ptr2, <2 x i1> zeroinitializer) diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll index 5b40d2e..9f3fdcd 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-gatherscatter.ll @@ -10,7 +10,7 @@ target triple = "aarch64-unknown-linux-gnu" define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride1(<vscale x 2 x i1> %pred, ptr %x, i64 %base) #0 { ; CHECK-LABEL: @test_ld1_gather_index_nxv2f64_stride1( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[X:%.*]], i64 [[BASE:%.*]] -; CHECK-NEXT: [[LD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[PRED:%.*]], <vscale x 2 x double> zeroinitializer) +; CHECK-NEXT: [[LD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 1 [[TMP1]], <vscale x 2 x i1> [[PRED:%.*]], <vscale x 2 x double> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x double> [[LD]] ; %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1) @@ -32,7 +32,7 @@ define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride2_negtest(<vsc define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride1_align8(<vscale x 2 x i1> %pred, ptr align 8 %x, i64 %base) #0 { ; CHECK-LABEL: @test_ld1_gather_index_nxv2f64_stride1_align8( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[X:%.*]], i64 [[BASE:%.*]] -; CHECK-NEXT: [[LD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP1]], i32 8, <vscale x 2 x i1> 
[[PRED:%.*]], <vscale x 2 x double> zeroinitializer) +; CHECK-NEXT: [[LD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP1]], <vscale x 2 x i1> [[PRED:%.*]], <vscale x 2 x double> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x double> [[LD]] ; %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1) @@ -47,7 +47,7 @@ define <vscale x 2 x double> @test_ld1_gather_index_nxv2f64_stride1_align8(<vsca define void @test_st1_scatter_index_nxv2f64_stride1(<vscale x 2 x i1> %pred, ptr %x, i64 %base, <vscale x 2 x double> %val) #0 { ; CHECK-LABEL: @test_st1_scatter_index_nxv2f64_stride1( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[X:%.*]], i64 [[BASE:%.*]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[VAL:%.*]], ptr [[TMP1]], i32 1, <vscale x 2 x i1> [[PRED:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[VAL:%.*]], ptr align 1 [[TMP1]], <vscale x 2 x i1> [[PRED:%.*]]) ; CHECK-NEXT: ret void ; %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1) @@ -69,7 +69,7 @@ define void @test_st1_scatter_index_nxv2f64_stride2_negtest(<vscale x 2 x i1> %p define void @test_st1_scatter_index_nxv2f64_stride1_align8(<vscale x 2 x i1> %pred, ptr align 8 %x, i64 %base, <vscale x 2 x double> %val) #0 { ; CHECK-LABEL: @test_st1_scatter_index_nxv2f64_stride1_align8( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[X:%.*]], i64 [[BASE:%.*]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[VAL:%.*]], ptr [[TMP1]], i32 8, <vscale x 2 x i1> [[PRED:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[VAL:%.*]], ptr align 8 [[TMP1]], <vscale x 2 x i1> [[PRED:%.*]]) ; CHECK-NEXT: ret void ; %idx = tail call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 %base, i64 1) diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll index d8d6740..fe2a183 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll @@ -28,7 +28,7 @@ define <vscale x 4 x i32> @combine_ld1_casted_predicate(ptr %ptr) #0 { define <vscale x 4 x i32> @combine_ld1_masked(ptr %ptr) #0 { ; CHECK-LABEL: @combine_ld1_masked( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16) -; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[PTR:%.*]], i32 1, <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> zeroinitializer), !annotation [[META0]] +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 1 [[PTR:%.*]], <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> zeroinitializer), !annotation [[META0]] ; CHECK-NEXT: ret <vscale x 4 x i32> [[TMP3]] ; %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16) @@ -40,7 +40,7 @@ define <vscale x 8 x i16> @combine_ld1_masked_casted_predicate(ptr %ptr) #0 { ; CHECK-LABEL: @combine_ld1_masked_casted_predicate( ; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PTR:%.*]], 
i32 1, <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> zeroinitializer), !annotation [[META0]] +; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PTR:%.*]], <vscale x 8 x i1> [[TMP3]], <vscale x 8 x i16> zeroinitializer), !annotation [[META0]] ; CHECK-NEXT: ret <vscale x 8 x i16> [[TMP5]] ; %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) @@ -75,7 +75,7 @@ define void @combine_st1_casted_predicate(<vscale x 4 x i32> %vec, ptr %ptr) #0 define void @combine_st1_masked(<vscale x 4 x i32> %vec, ptr %ptr) #0 { ; CHECK-LABEL: @combine_st1_masked( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[VEC:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 4 x i1> [[TMP1]]), !annotation [[META0]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[VEC:%.*]], ptr align 1 [[PTR:%.*]], <vscale x 4 x i1> [[TMP1]]), !annotation [[META0]] ; CHECK-NEXT: ret void ; %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16) @@ -87,7 +87,7 @@ define void @combine_st1_masked_casted_predicate(<vscale x 8 x i16> %vec, ptr %p ; CHECK-LABEL: @combine_st1_masked_casted_predicate( ; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[VEC:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 8 x i1> [[TMP3]]), !annotation [[META0]] +; CHECK-NEXT: call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> [[VEC:%.*]], ptr align 1 [[PTR:%.*]], <vscale x 8 x i1> [[TMP3]]), !annotation [[META0]] ; CHECK-NEXT: ret void ; %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) diff --git a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll index 5e2da42..297d2b6 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-masked-memops.ll @@ -19,7 +19,7 @@ define <4 x float> @mload(ptr %f, <4 x i32> %mask) { define <4 x float> @mload_v4f32_cmp(ptr %f, <4 x i32> %src) { ; CHECK-LABEL: @mload_v4f32_cmp( ; CHECK-NEXT: [[ICMP:%.*]] = icmp ne <4 x i32> [[SRC:%.*]], zeroinitializer -; CHECK-NEXT: [[LD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[F:%.*]], i32 1, <4 x i1> [[ICMP]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[F:%.*]], <4 x i1> [[ICMP]], <4 x float> zeroinitializer) ; CHECK-NEXT: ret <4 x float> [[LD]] ; %icmp = icmp ne <4 x i32> %src, zeroinitializer @@ -63,7 +63,7 @@ define <4 x float> @mload_real_ones(ptr %f) { define <4 x float> @mload_one_one(ptr %f) { ; CHECK-LABEL: @mload_one_one( -; CHECK-NEXT: [[LD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[F:%.*]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>) +; CHECK-NEXT: [[LD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 1 [[F:%.*]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison>) ; CHECK-NEXT: ret <4 x float> [[LD]] ; %ld = tail call <4 x float> 
@llvm.x86.avx.maskload.ps(ptr %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>) @@ -74,7 +74,7 @@ define <4 x float> @mload_one_one(ptr %f) { define <2 x double> @mload_one_one_double(ptr %f) { ; CHECK-LABEL: @mload_one_one_double( -; CHECK-NEXT: [[LD:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[F:%.*]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> <double poison, double 0.000000e+00>) +; CHECK-NEXT: [[LD:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 1 [[F:%.*]], <2 x i1> <i1 true, i1 false>, <2 x double> <double poison, double 0.000000e+00>) ; CHECK-NEXT: ret <2 x double> [[LD]] ; %ld = tail call <2 x double> @llvm.x86.avx.maskload.pd(ptr %f, <2 x i64> <i64 -1, i64 0>) @@ -85,7 +85,7 @@ define <2 x double> @mload_one_one_double(ptr %f) { define <8 x float> @mload_v8f32(ptr %f) { ; CHECK-LABEL: @mload_v8f32( -; CHECK-NEXT: [[LD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[F:%.*]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>) +; CHECK-NEXT: [[LD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 [[F:%.*]], <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>) ; CHECK-NEXT: ret <8 x float> [[LD]] ; %ld = tail call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>) @@ -97,7 +97,7 @@ define <8 x float> @mload_v8f32_cmp(ptr %f, <8 x float> %src0, <8 x float> %src1 ; CHECK-NEXT: [[ICMP0:%.*]] = fcmp one <8 x float> [[SRC0:%.*]], zeroinitializer ; CHECK-NEXT: [[ICMP1:%.*]] = fcmp one <8 x float> [[SRC1:%.*]], zeroinitializer ; CHECK-NEXT: [[MASK1:%.*]] = and <8 x i1> [[ICMP0]], [[ICMP1]] -; CHECK-NEXT: [[LD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[F:%.*]], i32 1, <8 x i1> [[MASK1]], <8 x float> zeroinitializer) +; CHECK-NEXT: [[LD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 1 [[F:%.*]], <8 x i1> [[MASK1]], <8 x float> zeroinitializer) ; CHECK-NEXT: ret <8 x float> [[LD]] ; %icmp0 = fcmp one <8 x float> %src0, zeroinitializer @@ -111,7 +111,7 @@ define <8 x float> @mload_v8f32_cmp(ptr %f, <8 x float> %src0, <8 x float> %src1 define <4 x double> @mload_v4f64(ptr %f) { ; CHECK-LABEL: @mload_v4f64( -; CHECK-NEXT: [[LD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[F:%.*]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>) +; CHECK-NEXT: [[LD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 1 [[F:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>) ; CHECK-NEXT: ret <4 x double> [[LD]] ; %ld = tail call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>) @@ -122,7 +122,7 @@ define <4 x double> @mload_v4f64(ptr %f) { define <4 x i32> @mload_v4i32(ptr %f) { ; CHECK-LABEL: @mload_v4i32( -; CHECK-NEXT: [[LD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[F:%.*]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>) +; 
CHECK-NEXT: [[LD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 1 [[F:%.*]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>) ; CHECK-NEXT: ret <4 x i32> [[LD]] ; %ld = tail call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>) @@ -131,7 +131,7 @@ define <4 x i32> @mload_v4i32(ptr %f) { define <2 x i64> @mload_v2i64(ptr %f) { ; CHECK-LABEL: @mload_v2i64( -; CHECK-NEXT: [[LD:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[F:%.*]], i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> <i64 poison, i64 0>) +; CHECK-NEXT: [[LD:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 [[F:%.*]], <2 x i1> <i1 true, i1 false>, <2 x i64> <i64 poison, i64 0>) ; CHECK-NEXT: ret <2 x i64> [[LD]] ; %ld = tail call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %f, <2 x i64> <i64 -1, i64 0>) @@ -140,7 +140,7 @@ define <2 x i64> @mload_v2i64(ptr %f) { define <8 x i32> @mload_v8i32(ptr %f) { ; CHECK-LABEL: @mload_v8i32( -; CHECK-NEXT: [[LD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[F:%.*]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> <i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0>) +; CHECK-NEXT: [[LD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 1 [[F:%.*]], <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> <i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0>) ; CHECK-NEXT: ret <8 x i32> [[LD]] ; %ld = tail call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %f, <8 x i32> <i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0>) @@ -149,7 +149,7 @@ define <8 x i32> @mload_v8i32(ptr %f) { define <4 x i64> @mload_v4i64(ptr %f) { ; CHECK-LABEL: @mload_v4i64( -; CHECK-NEXT: [[LD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[F:%.*]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> <i64 poison, i64 0, i64 0, i64 0>) +; CHECK-NEXT: [[LD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 [[F:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> <i64 poison, i64 0, i64 0, i64 0>) ; CHECK-NEXT: ret <4 x i64> [[LD]] ; %ld = tail call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %f, <4 x i64> <i64 -1, i64 0, i64 0, i64 0>) @@ -159,7 +159,7 @@ define <4 x i64> @mload_v4i64(ptr %f) { define <4 x i64> @mload_v4i64_cmp(ptr %f, <4 x i64> %src) { ; CHECK-LABEL: @mload_v4i64_cmp( ; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt <4 x i64> [[SRC:%.*]], splat (i64 -1) -; CHECK-NEXT: [[LD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr [[F:%.*]], i32 1, <4 x i1> [[ICMP]], <4 x i64> zeroinitializer) +; CHECK-NEXT: [[LD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 1 [[F:%.*]], <4 x i1> [[ICMP]], <4 x i64> zeroinitializer) ; CHECK-NEXT: ret <4 x i64> [[LD]] ; %icmp = icmp sge <4 x i64> %src, zeroinitializer @@ -186,7 +186,7 @@ define void @mstore(ptr %f, <4 x i32> %mask, <4 x float> %v) { define void @mstore_v4f32_cmp(ptr %f, <4 x i32> %src, <4 x float> %v) { ; CHECK-LABEL: @mstore_v4f32_cmp( ; CHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x i32> [[SRC:%.*]], zeroinitializer -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> [[ICMP]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> [[ICMP]]) ; CHECK-NEXT: ret void ; %icmp = icmp eq <4 x i32> %src, zeroinitializer @@ -230,7 +230,7 @@ define void 
@mstore_real_ones(ptr %f, <4 x float> %v) { define void @mstore_one_one(ptr %f, <4 x float> %v) { ; CHECK-LABEL: @mstore_one_one( -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> <i1 false, i1 false, i1 false, i1 true>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx.maskstore.ps(ptr %f, <4 x i32> <i32 0, i32 0, i32 0, i32 -1>, <4 x float> %v) @@ -241,7 +241,7 @@ define void @mstore_one_one(ptr %f, <4 x float> %v) { define void @mstore_one_one_double(ptr %f, <2 x double> %v) { ; CHECK-LABEL: @mstore_one_one_double( -; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[V:%.*]], ptr [[F:%.*]], i32 1, <2 x i1> <i1 true, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[V:%.*]], ptr align 1 [[F:%.*]], <2 x i1> <i1 true, i1 false>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx.maskstore.pd(ptr %f, <2 x i64> <i64 -1, i64 0>, <2 x double> %v) @@ -252,7 +252,7 @@ define void @mstore_one_one_double(ptr %f, <2 x double> %v) { define void @mstore_v8f32(ptr %f, <8 x float> %v) { ; CHECK-LABEL: @mstore_v8f32( -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[V:%.*]], ptr [[F:%.*]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[V:%.*]], ptr align 1 [[F:%.*]], <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx.maskstore.ps.256(ptr %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x float> %v) @@ -261,7 +261,7 @@ define void @mstore_v8f32(ptr %f, <8 x float> %v) { define void @mstore_v4f64(ptr %f, <4 x double> %v) { ; CHECK-LABEL: @mstore_v4f64( -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 false>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx.maskstore.pd.256(ptr %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x double> %v) @@ -271,7 +271,7 @@ define void @mstore_v4f64(ptr %f, <4 x double> %v) { define void @mstore_v4f64_cmp(ptr %f, <4 x i32> %src, <4 x double> %v) { ; CHECK-LABEL: @mstore_v4f64_cmp( ; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt <4 x i32> [[SRC:%.*]], splat (i32 -1) -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> [[ICMP]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> [[ICMP]]) ; CHECK-NEXT: ret void ; %icmp = icmp sge <4 x i32> %src, zeroinitializer @@ -284,7 +284,7 @@ define void @mstore_v4f64_cmp(ptr %f, <4 x i32> %src, <4 x double> %v) { define void @mstore_v4i32(ptr %f, <4 x i32> %v) { ; CHECK-LABEL: @mstore_v4i32( -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> <i1 false, i1 false, i1 true, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> <i1 false, i1 false, i1 true, i1 true>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx2.maskstore.d(ptr %f, <4 x i32> <i32 0, i32 1, i32 -1, i32 -2>, 
<4 x i32> %v) @@ -293,7 +293,7 @@ define void @mstore_v4i32(ptr %f, <4 x i32> %v) { define void @mstore_v2i64(ptr %f, <2 x i64> %v) { ; CHECK-LABEL: @mstore_v2i64( -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[V:%.*]], ptr [[F:%.*]], i32 1, <2 x i1> <i1 true, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[V:%.*]], ptr align 1 [[F:%.*]], <2 x i1> <i1 true, i1 false>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx2.maskstore.q(ptr %f, <2 x i64> <i64 -1, i64 0>, <2 x i64> %v) @@ -303,7 +303,7 @@ define void @mstore_v2i64(ptr %f, <2 x i64> %v) { define void @mstore_v8i32(ptr %f, <8 x i32> %v) { ; CHECK-LABEL: @mstore_v8i32( -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[V:%.*]], ptr [[F:%.*]], i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[V:%.*]], ptr align 1 [[F:%.*]], <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx2.maskstore.d.256(ptr %f, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 -1, i32 -2, i32 -3, i32 -4>, <8 x i32> %v) @@ -312,7 +312,7 @@ define void @mstore_v8i32(ptr %f, <8 x i32> %v) { define void @mstore_v4i64(ptr %f, <4 x i64> %v) { ; CHECK-LABEL: @mstore_v4i64( -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> <i1 true, i1 false, i1 false, i1 false>) ; CHECK-NEXT: ret void ; tail call void @llvm.x86.avx2.maskstore.q.256(ptr %f, <4 x i64> <i64 -1, i64 0, i64 1, i64 2>, <4 x i64> %v) @@ -324,7 +324,7 @@ define void @mstore_v4i64_cmp(ptr %f, <4 x i64> %src0, <4 x i64> %src1, <4 x i64 ; CHECK-NEXT: [[ICMP0:%.*]] = icmp eq <4 x i64> [[SRC0:%.*]], zeroinitializer ; CHECK-NEXT: [[ICMP1:%.*]] = icmp ne <4 x i64> [[SRC1:%.*]], zeroinitializer ; CHECK-NEXT: [[MASK1:%.*]] = and <4 x i1> [[ICMP0]], [[ICMP1]] -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr [[F:%.*]], i32 1, <4 x i1> [[MASK1]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[V:%.*]], ptr align 1 [[F:%.*]], <4 x i1> [[MASK1]]) ; CHECK-NEXT: ret void ; %icmp0 = icmp eq <4 x i64> %src0, zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/intrinsic-select.ll b/llvm/test/Transforms/InstCombine/intrinsic-select.ll index a8117ce..2f1f9fc 100644 --- a/llvm/test/Transforms/InstCombine/intrinsic-select.ll +++ b/llvm/test/Transforms/InstCombine/intrinsic-select.ll @@ -210,7 +210,7 @@ declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) define <2 x i32> @non_speculatable(i1 %b) { ; CHECK-LABEL: @non_speculatable( ; CHECK-NEXT: [[S:%.*]] = select i1 [[B:%.*]], ptr @g1, ptr @g2 -; CHECK-NEXT: [[C:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr nonnull [[S]], i32 64, <2 x i1> <i1 true, i1 false>, <2 x i32> poison) +; CHECK-NEXT: [[C:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr nonnull align 64 [[S]], <2 x i1> <i1 true, i1 false>, <2 x i32> poison) ; CHECK-NEXT: ret <2 x i32> [[C]] ; %s = select i1 %b, ptr @g1, ptr @g2 diff --git a/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll b/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll index 918ea60..b458b23 100644 --- a/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll +++ 
b/llvm/test/Transforms/InstCombine/load-store-masked-constant-array.ll @@ -7,8 +7,8 @@ define void @combine_masked_load_store_from_constant_array(ptr %ptr) { ; CHECK-LABEL: @combine_masked_load_store_from_constant_array( ; CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 0, i32 10) -; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nonnull @contant_int_array, i32 8, <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP2]], ptr [[PTR:%.*]], i32 1, <vscale x 2 x i1> [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nonnull align 8 @contant_int_array, <vscale x 2 x i1> [[TMP1]], <vscale x 2 x i64> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP2]], ptr align 1 [[PTR:%.*]], <vscale x 2 x i1> [[TMP1]]) ; CHECK-NEXT: ret void ; %1 = alloca [10 x i64] diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll index 155a7fd..02539d1 100644 --- a/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll @@ -37,7 +37,7 @@ define <2 x double> @load_undefmask(ptr %ptr, <2 x double> %passthru) { define <2 x double> @load_cemask(ptr %ptr, <2 x double> %passthru) { ; CHECK-LABEL: @load_cemask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 ptrtoint (ptr @G to i1)>, <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 2 [[PTR:%.*]], <2 x i1> <i1 true, i1 ptrtoint (ptr @G to i1)>, <2 x double> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 2, <2 x i1> <i1 1, i1 ptrtoint (ptr @G to i1)>, <2 x double> %passthru) @@ -47,7 +47,7 @@ define <2 x double> @load_cemask(ptr %ptr, <2 x double> %passthru) { define <2 x double> @load_lane0(ptr %ptr, double %pt) { ; CHECK-LABEL: @load_lane0( ; CHECK-NEXT: [[PTV2:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 2 [[PTR:%.*]], <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> poison, double %pt, i64 0 @@ -59,7 +59,7 @@ define <2 x double> @load_lane0(ptr %ptr, double %pt) { define double @load_all(ptr %base, double %pt) { ; CHECK-LABEL: @load_all( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, ptr [[BASE:%.*]], <4 x i64> <i64 0, i64 poison, i64 2, i64 3> -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> poison) +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> poison) ; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x double> [[RES]], i64 2 ; CHECK-NEXT: ret double [[ELT]] ; @@ -73,7 +73,7 @@ define <2 x double> @load_generic(ptr %ptr, double %pt, <2 x i1> %mask) { ; 
CHECK-LABEL: @load_generic( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 [[PTR:%.*]], <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> poison, double %pt, i64 0 @@ -116,7 +116,7 @@ define <2 x double> @load_spec_neg_size(ptr dereferenceable(8) %ptr, double %pt, ; CHECK-LABEL: @load_spec_neg_size( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull align 4 [[PTR:%.*]], <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> poison, double %pt, i64 0 @@ -131,7 +131,7 @@ define <2 x double> @load_spec_lan0(ptr dereferenceable(8) %ptr, double %pt, <2 ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull align 4 [[PTR:%.*]], <2 x i1> [[MASK2]], <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> poison, double %pt, i64 0 @@ -161,7 +161,7 @@ define void @store_onemask(ptr %ptr, <2 x double> %val) { define void @store_demandedelts(ptr %ptr, double %val) { ; CHECK-LABEL: @store_demandedelts( ; CHECK-NEXT: [[VALVEC1:%.*]] = insertelement <2 x double> poison, double [[VAL:%.*]], i64 0 -; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[VALVEC1]], ptr [[PTR:%.*]], i32 4, <2 x i1> <i1 true, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[VALVEC1]], ptr align 4 [[PTR:%.*]], <2 x i1> <i1 true, i1 false>) ; CHECK-NEXT: ret void ; %valvec1 = insertelement <2 x double> poison, double %val, i32 0 @@ -172,7 +172,7 @@ define void @store_demandedelts(ptr %ptr, double %val) { define <2 x double> @gather_generic(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %passthru) { ; CHECK-LABEL: @gather_generic( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS:%.*]], <2 x i1> [[MASK:%.*]], <2 x double> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %passthru) @@ -191,7 +191,7 @@ define <2 x double> @gather_zeromask(<2 x ptr> %ptrs, <2 x double> 
%passthru) { define <2 x double> @gather_onemask(<2 x ptr> %ptrs, <2 x double> %passthru) { ; CHECK-LABEL: @gather_onemask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 4, <2 x i1> splat (i1 true), <2 x double> poison) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS:%.*]], <2 x i1> splat (i1 true), <2 x double> poison) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru) @@ -203,7 +203,7 @@ define <4 x double> @gather_lane2(ptr %base, double %pt) { ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, ptr [[BASE:%.*]], <4 x i64> <i64 poison, i64 poison, i64 2, i64 poison> ; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <4 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 0> -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]]) ; CHECK-NEXT: ret <4 x double> [[RES]] ; %ptrs = getelementptr double, ptr %base, <4 x i64> <i64 0, i64 1, i64 2, i64 3> @@ -219,7 +219,7 @@ define <2 x double> @gather_lane0_maybe(ptr %base, double %pt, <2 x i1> %mask) ; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS]], <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptrs = getelementptr double, ptr %base, <2 x i64> <i64 0, i64 1> @@ -236,7 +236,7 @@ define <2 x double> @gather_lane0_maybe_spec(ptr %base, double %pt, <2 x i1> %ma ; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS]], <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptrs = getelementptr double, ptr %base, <2 x i64> <i64 0, i64 1> @@ -260,7 +260,7 @@ define void @scatter_demandedelts(ptr %ptr, double %val) { ; CHECK-LABEL: @scatter_demandedelts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, ptr [[PTR:%.*]], <2 x i64> <i64 0, i64 poison> ; CHECK-NEXT: [[VALVEC1:%.*]] = insertelement <2 x double> poison, double [[VAL:%.*]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[VALVEC1]], <2 x ptr> [[PTRS]], i32 8, <2 x i1> <i1 true, i1 false>) +; 
CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[VALVEC1]], <2 x ptr> align 8 [[PTRS]], <2 x i1> <i1 true, i1 false>) ; CHECK-NEXT: ret void ; %ptrs = getelementptr double, ptr %ptr, <2 x i64> <i64 0, i64 1> diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll index 67ab167..6aadb08 100644 --- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll @@ -46,7 +46,7 @@ define <2 x double> @load_undefmask(ptr %ptr, <2 x double> %passthru) { define <2 x double> @load_cemask(ptr %ptr, <2 x double> %passthru) { ; CHECK-LABEL: @load_cemask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 ptrtoint (ptr @G to i1)>, <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 2 [[PTR:%.*]], <2 x i1> <i1 true, i1 ptrtoint (ptr @G to i1)>, <2 x double> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 2, <2 x i1> <i1 1, i1 ptrtoint (ptr @G to i1)>, <2 x double> %passthru) @@ -56,7 +56,7 @@ define <2 x double> @load_cemask(ptr %ptr, <2 x double> %passthru) { define <2 x double> @load_lane0(ptr %ptr, double %pt) { ; CHECK-LABEL: @load_lane0( ; CHECK-NEXT: [[PTV2:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 2 [[PTR:%.*]], <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> undef, double %pt, i64 0 @@ -68,7 +68,7 @@ define <2 x double> @load_lane0(ptr %ptr, double %pt) { define double @load_all(ptr %base, double %pt) { ; CHECK-LABEL: @load_all( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, ptr [[BASE:%.*]], <4 x i64> <i64 0, i64 poison, i64 2, i64 3> -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> poison) +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> poison) ; CHECK-NEXT: [[ELT:%.*]] = extractelement <4 x double> [[RES]], i64 2 ; CHECK-NEXT: ret double [[ELT]] ; @@ -82,7 +82,7 @@ define <2 x double> @load_generic(ptr %ptr, double %pt, <2 x i1> %mask) { ; CHECK-LABEL: @load_generic( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr align 4 [[PTR:%.*]], <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> undef, double %pt, i64 0 @@ -125,7 +125,7 @@ define <2 x double> @load_spec_neg_size(ptr dereferenceable(8) %ptr, double %pt, ; CHECK-LABEL: @load_spec_neg_size( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; 
CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull align 4 [[PTR:%.*]], <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> undef, double %pt, i64 0 @@ -140,7 +140,7 @@ define <2 x double> @load_spec_lan0(ptr dereferenceable(8) %ptr, double %pt, <2 ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PTV2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0(ptr nonnull align 4 [[PTR:%.*]], <2 x i1> [[MASK2]], <2 x double> [[PTV2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> undef, double %pt, i64 0 @@ -187,7 +187,7 @@ define void @store_one_withpoison_mask(ptr %ptr, <2 x double> %val) { define void @store_demandedelts(ptr %ptr, double %val) { ; CHECK-LABEL: @store_demandedelts( ; CHECK-NEXT: [[VALVEC1:%.*]] = insertelement <2 x double> poison, double [[VAL:%.*]], i64 0 -; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[VALVEC1]], ptr [[PTR:%.*]], i32 4, <2 x i1> <i1 true, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v2f64.p0(<2 x double> [[VALVEC1]], ptr align 4 [[PTR:%.*]], <2 x i1> <i1 true, i1 false>) ; CHECK-NEXT: ret void ; %valvec1 = insertelement <2 x double> undef, double %val, i32 0 @@ -198,7 +198,7 @@ define void @store_demandedelts(ptr %ptr, double %val) { define <2 x double> @gather_generic(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %passthru) { ; CHECK-LABEL: @gather_generic( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS:%.*]], <2 x i1> [[MASK:%.*]], <2 x double> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %passthru) @@ -224,7 +224,7 @@ define <2 x double> @gather_zero_withpoison_mask(<2 x ptr> %ptrs, <2 x double> % define <2 x double> @gather_onemask(<2 x ptr> %ptrs, <2 x double> %passthru) { ; CHECK-LABEL: @gather_onemask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 4, <2 x i1> splat (i1 true), <2 x double> poison) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS:%.*]], <2 x i1> splat (i1 true), <2 x double> poison) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru) @@ -233,7 +233,7 @@ define <2 x double> @gather_onemask(<2 x ptr> %ptrs, <2 x double> %passthru) { define <2 x double> @gather_one_withpoisonmask(<2 x ptr> %ptrs, <2 x double> %passthru) { ; CHECK-LABEL: 
@gather_one_withpoisonmask( -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 4, <2 x i1> <i1 true, i1 poison>, <2 x double> [[PASSTHRU:%.*]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS:%.*]], <2 x i1> <i1 true, i1 poison>, <2 x double> [[PASSTHRU:%.*]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> <i1 true, i1 poison>, <2 x double> %passthru) @@ -245,7 +245,7 @@ define <4 x double> @gather_lane2(ptr %base, double %pt) { ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, ptr [[BASE:%.*]], <4 x i64> <i64 poison, i64 poison, i64 2, i64 poison> ; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <4 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <4 x double> [[PT_V1]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 0> -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 4 [[PTRS]], <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> [[PT_V2]]) ; CHECK-NEXT: ret <4 x double> [[RES]] ; %ptrs = getelementptr double, ptr %base, <4 x i64> <i64 0, i64 1, i64 2, i64 3> @@ -261,7 +261,7 @@ define <2 x double> @gather_lane0_maybe(ptr %base, double %pt, <2 x i1> %mask) ; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS]], <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptrs = getelementptr double, ptr %base, <2 x i64> <i64 0, i64 1> @@ -278,7 +278,7 @@ define <2 x double> @gather_lane0_maybe_spec(ptr %base, double %pt, <2 x i1> %ma ; CHECK-NEXT: [[PT_V1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[MASK2:%.*]] = insertelement <2 x i1> [[MASK:%.*]], i1 false, i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[PTRS]], i32 4, <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 4 [[PTRS]], <2 x i1> [[MASK2]], <2 x double> [[PT_V2]]) ; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptrs = getelementptr double, ptr %base, <2 x i64> <i64 0, i64 1> @@ -308,7 +308,7 @@ define void @scatter_zero_withpoison_mask(<2 x ptr> %ptrs, <2 x double> %val) { define void @scatter_one_withpoison_mask(<2 x ptr> %ptrs, <2 x double> %val) { ; CHECK-LABEL: @scatter_one_withpoison_mask( -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[VAL:%.*]], <2 x ptr> [[PTRS:%.*]], i32 8, <2 x i1> <i1 true, i1 poison>) +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[VAL:%.*]], <2 x ptr> align 8 [[PTRS:%.*]], <2 x i1> <i1 true, i1 poison>) 
; CHECK-NEXT: ret void ; call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %val, <2 x ptr> %ptrs, i32 8, <2 x i1> <i1 1, i1 poison>) @@ -319,7 +319,7 @@ define void @scatter_demandedelts(ptr %ptr, double %val) { ; CHECK-LABEL: @scatter_demandedelts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr double, ptr [[PTR:%.*]], <2 x i64> <i64 0, i64 poison> ; CHECK-NEXT: [[VALVEC1:%.*]] = insertelement <2 x double> poison, double [[VAL:%.*]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[VALVEC1]], <2 x ptr> [[PTRS]], i32 8, <2 x i1> <i1 true, i1 false>) +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[VALVEC1]], <2 x ptr> align 8 [[PTRS]], <2 x i1> <i1 true, i1 false>) ; CHECK-NEXT: ret void ; %ptrs = getelementptr double, ptr %ptr, <2 x i64> <i64 0, i64 1> @@ -407,7 +407,7 @@ define void @negative_scatter_v4i16_no_uniform_vals_uniform_ptrs_all_inactive_ma ; CHECK-NEXT: [[INSERT_ELT:%.*]] = insertelement <4 x ptr> poison, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[INSERT_ELT]], <4 x ptr> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0> ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[SRC:%.*]], align 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[WIDE_LOAD]], <4 x ptr> [[BROADCAST_SPLAT]], i32 2, <4 x i1> <i1 false, i1 false, i1 true, i1 true>) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[WIDE_LOAD]], <4 x ptr> align 2 [[BROADCAST_SPLAT]], <4 x i1> <i1 false, i1 false, i1 true, i1 true>) ; CHECK-NEXT: ret void ; %insert.elt = insertelement <4 x ptr> poison, ptr %dst, i32 0 @@ -422,7 +422,7 @@ define void @negative_scatter_v4i16_no_uniform_vals_no_uniform_ptrs_all_active_m ; CHECK-LABEL: @negative_scatter_v4i16_no_uniform_vals_no_uniform_ptrs_all_active_mask( ; CHECK-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x ptr> [[INPTR:%.*]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[SRC:%.*]], align 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[WIDE_LOAD]], <4 x ptr> [[BROADCAST]], i32 2, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[WIDE_LOAD]], <4 x ptr> align 2 [[BROADCAST]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: ret void ; %broadcast= shufflevector <4 x ptr> %inPtr, <4 x ptr> poison, <4 x i32> zeroinitializer @@ -471,7 +471,7 @@ define <2 x i64> @gather_v2i64_uniform_ptrs_all_active_mask(ptr %src) { define <2 x i64> @negative_gather_v2i64_non_uniform_ptrs_all_active_mask(<2 x ptr> %inVal, ptr %src ) { ; CHECK-LABEL: @negative_gather_v2i64_non_uniform_ptrs_all_active_mask( ; CHECK-NEXT: [[INSERT_VALUE:%.*]] = insertelement <2 x ptr> [[INVAL:%.*]], ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[INSERT_VALUE]], i32 8, <2 x i1> splat (i1 true), <2 x i64> poison) +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 [[INSERT_VALUE]], <2 x i1> splat (i1 true), <2 x i64> poison) ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %insert.value = insertelement <2 x ptr> %inVal, ptr %src, i32 1 @@ -484,7 +484,7 @@ define <2 x i64> @negative_gather_v2i64_uniform_ptrs_no_all_active_mask(ptr %src ; CHECK-LABEL: @negative_gather_v2i64_uniform_ptrs_no_all_active_mask( ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[SRC:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> 
[[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[BROADCAST_SPLAT]], i32 8, <2 x i1> [[MASK:%.*]], <2 x i64> undef) +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 [[BROADCAST_SPLAT]], <2 x i1> [[MASK:%.*]], <2 x i64> undef) ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %broadcast.splatinsert = insertelement <2 x ptr> poison, ptr %src, i32 0 diff --git a/llvm/test/Transforms/InstCombine/pr83947.ll b/llvm/test/Transforms/InstCombine/pr83947.ll index 679230a4..18468d1 100644 --- a/llvm/test/Transforms/InstCombine/pr83947.ll +++ b/llvm/test/Transforms/InstCombine/pr83947.ll @@ -6,7 +6,7 @@ define void @masked_scatter1() { ; CHECK-LABEL: define void @masked_scatter1() { -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr @c, i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 ptrtoint (ptr @b to i1), i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> align 4 shufflevector (<vscale x 4 x ptr> insertelement (<vscale x 4 x ptr> poison, ptr @c, i64 0), <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 ptrtoint (ptr @b to i1), i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret void ; call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x ptr> splat (ptr @c), i32 4, <vscale x 4 x i1> splat (i1 ptrtoint (ptr @b to i1))) @@ -57,7 +57,7 @@ define void @masked_scatter6() { define void @masked_scatter7() { ; CHECK-LABEL: define void @masked_scatter7() { -; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> <ptr @c, ptr @c>, i32 4, <2 x i1> <i1 ptrtoint (ptr @b to i1), i1 ptrtoint (ptr @b to i1)>) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> align 4 <ptr @c, ptr @c>, <2 x i1> <i1 ptrtoint (ptr @b to i1), i1 ptrtoint (ptr @b to i1)>) ; CHECK-NEXT: ret void ; call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> zeroinitializer, <2 x ptr> splat (ptr @c), i32 4, <2 x i1> splat (i1 ptrtoint (ptr @b to i1))) diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll deleted file mode 100644 index ffaa8b1..0000000 --- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll +++ /dev/null @@ -1,163 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -; The ptrtoaddr folds are also valid for pointers that have external state. 
-target datalayout = "pe1:64:64:64:32" - -@g = external global i8 -@g2 = external global i8 - -@g.as1 = external addrspace(1) global i8 -@g2.as1 = external addrspace(1) global i8 - -define i64 @ptrtoaddr_inttoptr_arg(i64 %a) { -; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg( -; CHECK-SAME: i64 [[A:%.*]]) { -; CHECK-NEXT: ret i64 [[A]] -; - %toptr = inttoptr i64 %a to ptr - %toaddr = ptrtoaddr ptr %toptr to i64 - ret i64 %toaddr -} - -define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) { -; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize( -; CHECK-SAME: i32 [[A:%.*]]) { -; CHECK-NEXT: ret i32 [[A]] -; - %toptr = inttoptr i32 %a to ptr addrspace(1) - %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 - ret i32 %toaddr -} - -define i32 @ptrtoaddr_inttoptr() { -; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() { -; CHECK-NEXT: ret i32 -1 -; - ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32) -} - -define i32 @ptrtoaddr_inttoptr_diff_size1() { -; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() { -; CHECK-NEXT: ret i32 -1 -; - ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) -} - -define i32 @ptrtoaddr_inttoptr_diff_size2() { -; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() { -; CHECK-NEXT: ret i32 65535 -; - ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) -} - -define i64 @ptrtoaddr_inttoptr_noas1() { -; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() { -; CHECK-NEXT: ret i64 1 -; - ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64) -} - -define i64 @ptr2addr2_inttoptr_noas2() { -; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() { -; CHECK-NEXT: ret i64 123 -; - ret i64 ptrtoaddr (ptr inttoptr (i64 123 to ptr) to i64) -} - -define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { -; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { -; CHECK-NEXT: ret i64 4294967295 -; - ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64) -} - -define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { -; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { -; CHECK-NEXT: ret i64 -1 -; - ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) -} - -define i64 @ptrtoaddr_gep_null() { -; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() { -; CHECK-NEXT: ret i64 42 -; - ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 42) to i64) -} - -define i32 @ptrtoaddr_gep_null_addrsize() { -; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() { -; CHECK-NEXT: ret i32 42 -; - ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32) -} - -define i64 @ptrtoaddr_gep_sub() { -; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() { -; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64)) -; - ret i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64) -} - -define i32 @ptrtoaddr_gep_sub_addrsize() { -; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() { -; CHECK-NEXT: ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32)) -; - ret i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32) -} - -; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously -; exposed provenance, which is not necessarily that of @g (especially as -; ptrtoaddr does not expose the provenance.) 
-define ptr @inttoptr_of_ptrtoaddr() { -; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() { -; CHECK-NEXT: ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr) -; - ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr) -} - -define i64 @ptrtoaddr_sub_consts_unrelated() { -; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() { -; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64)) -; - ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64)) -} - -define i64 @ptrtoaddr_sub_consts_offset() { -; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() { -; CHECK-NEXT: ret i64 42 -; - ret i64 sub (i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), i64 ptrtoaddr (ptr @g to i64)) -} - -define i32 @ptrtoaddr_sub_consts_offset_addrsize() { -; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() { -; CHECK-NEXT: ret i32 42 -; - ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32)) -} - -define i64 @ptrtoaddr_sub_known_offset(ptr %p) { -; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset( -; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: ret i64 42 -; - %p2 = getelementptr inbounds i8, ptr %p, i64 42 - %p.addr = ptrtoaddr ptr %p to i64 - %p2.addr = ptrtoaddr ptr %p2 to i64 - %sub = sub i64 %p2.addr, %p.addr - ret i64 %sub -} - -define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) { -; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize( -; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) { -; CHECK-NEXT: ret i32 42 -; - %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42 - %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 - %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 - %sub = sub i32 %p2.addr, %p.addr - ret i32 %sub -} diff --git a/llvm/test/Transforms/InstCombine/select-masked_gather.ll b/llvm/test/Transforms/InstCombine/select-masked_gather.ll index b6b433c..410d594 100644 --- a/llvm/test/Transforms/InstCombine/select-masked_gather.ll +++ b/llvm/test/Transforms/InstCombine/select-masked_gather.ll @@ -4,7 +4,7 @@ ; Fold zeroing of inactive lanes into the gather's passthrough parameter. define <vscale x 2 x float> @masked_gather_and_zero_inactive_1(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_1( -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x float> [[GATHER]] ; %gather = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x ptr> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef) @@ -15,7 +15,7 @@ define <vscale x 2 x float> @masked_gather_and_zero_inactive_1(<vscale x 2 x ptr ; As above but reuse the gather's existing passthrough. 
define <vscale x 2 x i32> @masked_gather_and_zero_inactive_2(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_2( -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x i32> [[GATHER]] ; %gather = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x ptr> %ptr, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> zeroinitializer) @@ -26,7 +26,7 @@ define <vscale x 2 x i32> @masked_gather_and_zero_inactive_2(<vscale x 2 x ptr> ; No transform when the gather's passthrough cannot be reused or altered. define <vscale x 2 x i32> @masked_gather_and_zero_inactive_3(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthrough) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_3( -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: [[MASKED:%.*]] = select <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[GATHER]], <vscale x 2 x i32> zeroinitializer ; CHECK-NEXT: ret <vscale x 2 x i32> [[MASKED]] ; @@ -39,7 +39,7 @@ define <vscale x 2 x i32> @masked_gather_and_zero_inactive_3(<vscale x 2 x ptr> define <vscale x 2 x i32> @masked_gather_and_zero_inactive_4(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %inv_mask) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_4( ; CHECK-NEXT: [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], splat (i1 true) -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x i32> [[GATHER]] ; %mask = xor <vscale x 2 x i1> %inv_mask, splat (i1 true) @@ -52,7 +52,7 @@ define <vscale x 2 x i32> @masked_gather_and_zero_inactive_4(<vscale x 2 x ptr> define <vscale x 2 x i32> @masked_gather_and_zero_inactive_5(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %inv_mask) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_5( ; CHECK-NEXT: [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], splat (i1 true) -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x i32> [[GATHER]] ; %mask = xor <vscale x 2 x i1> %inv_mask, splat (i1 true) @@ -65,7 +65,7 @@ define <vscale x 2 x i32> @masked_gather_and_zero_inactive_5(<vscale x 2 x ptr> define <vscale x 2 x i32> 
@masked_gather_and_zero_inactive_6(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %inv_mask, <vscale x 2 x i32> %passthrough) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_6( ; CHECK-NEXT: [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], splat (i1 true) -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK]], <vscale x 2 x i32> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: [[MASKED:%.*]] = select <vscale x 2 x i1> [[INV_MASK]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[GATHER]] ; CHECK-NEXT: ret <vscale x 2 x i32> [[MASKED]] ; @@ -78,7 +78,7 @@ define <vscale x 2 x i32> @masked_gather_and_zero_inactive_6(<vscale x 2 x ptr> ; No transform when select and gather masks have no relation. define <vscale x 2 x i32> @masked_gather_and_zero_inactive_7(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %mask1, <vscale x 2 x i1> %mask2) { ; CHECK-LABEL: @masked_gather_and_zero_inactive_7( -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[MASK1:%.*]], <vscale x 2 x i32> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[MASK1:%.*]], <vscale x 2 x i32> zeroinitializer) ; CHECK-NEXT: [[MASKED:%.*]] = select <vscale x 2 x i1> [[MASK2:%.*]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[GATHER]] ; CHECK-NEXT: ret <vscale x 2 x i32> [[MASKED]] ; @@ -93,7 +93,7 @@ define <vscale x 2 x float> @masked_gather_and_zero_inactive_8(<vscale x 2 x ptr ; CHECK-LABEL: @masked_gather_and_zero_inactive_8( ; CHECK-NEXT: [[MASK:%.*]] = xor <vscale x 2 x i1> [[INV_MASK:%.*]], splat (i1 true) ; CHECK-NEXT: [[PG:%.*]] = and <vscale x 2 x i1> [[COND:%.*]], [[MASK]] -; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 4, <vscale x 2 x i1> [[PG]], <vscale x 2 x float> zeroinitializer) +; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTR:%.*]], <vscale x 2 x i1> [[PG]], <vscale x 2 x float> zeroinitializer) ; CHECK-NEXT: ret <vscale x 2 x float> [[GATHER]] ; %mask = xor <vscale x 2 x i1> %inv_mask, splat (i1 true) @@ -106,7 +106,7 @@ define <vscale x 2 x float> @masked_gather_and_zero_inactive_8(<vscale x 2 x ptr define <vscale x 2 x float> @masked_load_and_scalar_select_cond(<vscale x 2 x ptr> %ptr, <vscale x 2 x i1> %mask, i1 %cond) { ; CHECK-LABEL: @masked_load_and_scalar_select_cond( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> [[PTR:%.*]], i32 32, <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> undef) +; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 32 [[PTR:%.*]], <vscale x 2 x i1> [[MASK:%.*]], <vscale x 2 x float> undef) ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], <vscale x 2 x float> zeroinitializer, <vscale x 2 x float> [[TMP0]] ; CHECK-NEXT: ret <vscale x 2 x float> [[TMP1]] ; diff --git a/llvm/test/Transforms/InstCombine/select-masked_load.ll 
b/llvm/test/Transforms/InstCombine/select-masked_load.ll index 22e30ac0..f7584c4 100644 --- a/llvm/test/Transforms/InstCombine/select-masked_load.ll +++ b/llvm/test/Transforms/InstCombine/select-masked_load.ll @@ -4,7 +4,7 @@ ; Fold zeroing of inactive lanes into the load's passthrough parameter. define <4 x float> @masked_load_and_zero_inactive_1(ptr %ptr, <4 x i1> %mask) { ; CHECK-LABEL: @masked_load_and_zero_inactive_1( -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer) ; CHECK-NEXT: ret <4 x float> [[LOAD]] ; %load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x float> undef) @@ -15,7 +15,7 @@ define <4 x float> @masked_load_and_zero_inactive_1(ptr %ptr, <4 x i1> %mask) { ; As above but reuse the load's existing passthrough. define <4 x i32> @masked_load_and_zero_inactive_2(ptr %ptr, <4 x i1> %mask) { ; CHECK-LABEL: @masked_load_and_zero_inactive_2( -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK:%.*]], <4 x i32> zeroinitializer) ; CHECK-NEXT: ret <4 x i32> [[LOAD]] ; %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer) @@ -26,7 +26,7 @@ define <4 x i32> @masked_load_and_zero_inactive_2(ptr %ptr, <4 x i1> %mask) { ; No transform when the load's passthrough cannot be reused or altered. define <4 x i32> @masked_load_and_zero_inactive_3(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthrough) { ; CHECK-LABEL: @masked_load_and_zero_inactive_3( -; CHECK-NEXT: [[MASKED:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[MASKED:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK:%.*]], <4 x i32> zeroinitializer) ; CHECK-NEXT: ret <4 x i32> [[MASKED]] ; %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough) @@ -38,7 +38,7 @@ define <4 x i32> @masked_load_and_zero_inactive_3(ptr %ptr, <4 x i1> %mask, <4 x define <4 x i32> @masked_load_and_zero_inactive_4(ptr %ptr, <4 x i1> %inv_mask) { ; CHECK-LABEL: @masked_load_and_zero_inactive_4( ; CHECK-NEXT: [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true) -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK]], <4 x i32> zeroinitializer) ; CHECK-NEXT: ret <4 x i32> [[LOAD]] ; %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true> @@ -51,7 +51,7 @@ define <4 x i32> @masked_load_and_zero_inactive_4(ptr %ptr, <4 x i1> %inv_mask) define <4 x i32> @masked_load_and_zero_inactive_5(ptr %ptr, <4 x i1> %inv_mask) { ; CHECK-LABEL: @masked_load_and_zero_inactive_5( ; CHECK-NEXT: [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true) -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK]], <4 x i32> zeroinitializer) ; CHECK-NEXT: ret <4 x i32> [[LOAD]] ; %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true> @@ -64,7 +64,7 @@ define <4 x i32> @masked_load_and_zero_inactive_5(ptr %ptr, <4 x i1> %inv_mask) define <4 x i32> @masked_load_and_zero_inactive_6(ptr %ptr, <4 x i1> %inv_mask, <4 x i32> %passthrough) { ; CHECK-LABEL: @masked_load_and_zero_inactive_6( ; CHECK-NEXT: [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true) -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK]], <4 x i32> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK]], <4 x i32> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: [[MASKED:%.*]] = select <4 x i1> [[INV_MASK]], <4 x i32> zeroinitializer, <4 x i32> [[LOAD]] ; CHECK-NEXT: ret <4 x i32> [[MASKED]] ; @@ -77,7 +77,7 @@ define <4 x i32> @masked_load_and_zero_inactive_6(ptr %ptr, <4 x i1> %inv_mask, ; No transform when select and load masks have no relation. define <4 x i32> @masked_load_and_zero_inactive_7(ptr %ptr, <4 x i1> %mask1, <4 x i1> %mask2) { ; CHECK-LABEL: @masked_load_and_zero_inactive_7( -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK1:%.*]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[MASK1:%.*]], <4 x i32> zeroinitializer) ; CHECK-NEXT: [[MASKED:%.*]] = select <4 x i1> [[MASK2:%.*]], <4 x i32> zeroinitializer, <4 x i32> [[LOAD]] ; CHECK-NEXT: ret <4 x i32> [[MASKED]] ; @@ -92,7 +92,7 @@ define <4 x float> @masked_load_and_zero_inactive_8(ptr %ptr, <4 x i1> %inv_mask ; CHECK-LABEL: @masked_load_and_zero_inactive_8( ; CHECK-NEXT: [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true) ; CHECK-NEXT: [[PG:%.*]] = and <4 x i1> [[COND:%.*]], [[MASK]] -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[PG]], <4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[PTR:%.*]], <4 x i1> [[PG]], <4 x float> zeroinitializer) ; CHECK-NEXT: ret <4 x float> [[LOAD]] ; %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true> @@ -105,7 +105,7 @@ define <4 x float> @masked_load_and_zero_inactive_8(ptr %ptr, <4 x i1> %inv_mask define <8 x float> @masked_load_and_scalar_select_cond(ptr %ptr, <8 x i1> %mask, i1 %cond) { ; CHECK-LABEL: @masked_load_and_scalar_select_cond( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[PTR:%.*]], i32 32, <8 x i1> [[MASK:%.*]], <8 x float> undef) +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 32 [[PTR:%.*]], <8 x i1> [[MASK:%.*]], <8 x float> undef) ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], <8 x float> zeroinitializer, <8 x float> [[TMP0]] ; CHECK-NEXT: ret <8 x float> [[TMP1]] ; @@ -117,7 +117,7 @@ entry: define <vscale x 4 x float> @fold_sel_into_masked_load_scalable(ptr %loc, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough) { ; CHECK-LABEL: @fold_sel_into_masked_load_scalable( -; CHECK-NEXT: [[SEL:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[LOC:%.*]], i32 1, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = call <vscale x 4 x float> 
@llvm.masked.load.nxv4f32.p0(ptr align 1 [[LOC:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: ret <vscale x 4 x float> [[SEL]] ; %load = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %loc, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer) @@ -127,7 +127,7 @@ define <vscale x 4 x float> @fold_sel_into_masked_load_scalable(ptr %loc, <vscal define <vscale x 4 x float> @neg_fold_sel_into_masked_load_mask_mismatch(ptr %loc, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask2, <vscale x 4 x float> %passthrough) { ; CHECK-LABEL: @neg_fold_sel_into_masked_load_mask_mismatch( -; CHECK-NEXT: [[LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[LOC:%.*]], i32 1, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[LOC:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]) ; CHECK-NEXT: [[SEL:%.*]] = select <vscale x 4 x i1> [[MASK2:%.*]], <vscale x 4 x float> [[LOAD]], <vscale x 4 x float> [[PASSTHROUGH]] ; CHECK-NEXT: ret <vscale x 4 x float> [[SEL]] ; @@ -138,9 +138,9 @@ define <vscale x 4 x float> @neg_fold_sel_into_masked_load_mask_mismatch(ptr %lo define <vscale x 4 x float> @fold_sel_into_masked_load_scalable_one_use_check(ptr %loc1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough, ptr %loc2) { ; CHECK-LABEL: @fold_sel_into_masked_load_scalable_one_use_check( -; CHECK-NEXT: [[LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[LOC:%.*]], i32 1, <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> zeroinitializer) +; CHECK-NEXT: [[LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 1 [[LOC1:%.*]], <vscale x 4 x i1> [[MASK:%.*]], <vscale x 4 x float> zeroinitializer) ; CHECK-NEXT: [[SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[LOAD]], <vscale x 4 x float> [[PASSTHROUGH:%.*]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD]], ptr [[LOC2:%.*]], i32 1, <vscale x 4 x i1> [[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[LOAD]], ptr align 1 [[LOC2:%.*]], <vscale x 4 x i1> [[MASK]]) ; CHECK-NEXT: ret <vscale x 4 x float> [[SEL]] ; %load = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %loc1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer) diff --git a/llvm/test/Transforms/InstSimplify/freeze-noundef.ll b/llvm/test/Transforms/InstSimplify/freeze-noundef.ll index e6c2969..728e813 100644 --- a/llvm/test/Transforms/InstSimplify/freeze-noundef.ll +++ b/llvm/test/Transforms/InstSimplify/freeze-noundef.ll @@ -156,3 +156,30 @@ define {i8, i32} @noundef_metadata2(ptr %p) { %v.fr = freeze {i8, i32} %v ret {i8, i32} %v.fr } + +; Splats have two poison values but only the poison-ness of the splatted value +; matters. 
+define <4 x i32> @splat(i32 noundef %x) { +; CHECK-LABEL: @splat( +; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0 +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: ret <4 x i32> [[SPLAT]] +; + %ins = insertelement <4 x i32> poison, i32 %x, i32 0 + %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer + %splat.fr = freeze <4 x i32> %splat + ret <4 x i32> %splat.fr +} + +define <4 x i32> @splat_poison_idx(i32 noundef %x) { +; CHECK-LABEL: @splat_poison_idx( +; CHECK-NEXT: [[INS:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0 +; CHECK-NEXT: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +; CHECK-NEXT: [[SPLAT_FR:%.*]] = freeze <4 x i32> [[SPLAT]] +; CHECK-NEXT: ret <4 x i32> [[SPLAT_FR]] +; + %ins = insertelement <4 x i32> poison, i32 %x, i32 0 + %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> + %splat.fr = freeze <4 x i32> %splat + ret <4 x i32> %splat.fr +} diff --git a/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll new file mode 100644 index 0000000..d06b520 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ptrtoaddr.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instsimplify -S | FileCheck %s + +; The ptrtoaddr folds are also valid for pointers that have external state. +target datalayout = "pe1:64:64:64:32" + +@g = external global i8 +@g2 = external global i8 + +@g.as1 = external addrspace(1) global i8 +@g2.as1 = external addrspace(1) global i8 + +define i64 @ptrtoaddr_inttoptr_arg(i64 %a) { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: ret i64 [[A]] +; + %toptr = inttoptr i64 %a to ptr + %toaddr = ptrtoaddr ptr %toptr to i64 + ret i64 %toaddr +} + +define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: ret i32 [[A]] +; + %toptr = inttoptr i32 %a to ptr addrspace(1) + %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 + ret i32 %toaddr +} + +define i32 @ptrtoaddr_inttoptr() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() { +; CHECK-NEXT: ret i32 -1 +; + %toptr = inttoptr i32 -1 to ptr addrspace(1) + %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 + ret i32 %toaddr +} + +define i32 @ptrtoaddr_inttoptr_diff_size1() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() { +; CHECK-NEXT: ret i32 -1 +; + %toptr = inttoptr i64 -1 to ptr addrspace(1) + %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 + ret i32 %toaddr +} + +define i32 @ptrtoaddr_inttoptr_diff_size2() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() { +; CHECK-NEXT: ret i32 65535 +; + %toptr = inttoptr i16 -1 to ptr addrspace(1) + %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 + ret i32 %toaddr +} + +define i64 @ptrtoaddr_inttoptr_noas1() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() { +; CHECK-NEXT: ret i64 1 +; + %toptr = getelementptr i8, ptr null, i64 1 + %toaddr = ptrtoaddr ptr %toptr to i64 + ret i64 %toaddr +} + +define i64 @ptr2addr2_inttoptr_noas2() { +; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() { +; CHECK-NEXT: ret i64 123 +; + %toptr = inttoptr i64 123 to ptr + %toaddr = ptrtoaddr ptr %toptr to i64 + 
ret i64 %toaddr +} + +define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { +; CHECK-NEXT: ret i64 4294967295 +; + %toptr = inttoptr i32 -1 to ptr + %toaddr = ptrtoaddr ptr %toptr to i64 + ret i64 %toaddr +} + +define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { +; CHECK-NEXT: ret i64 -1 +; + %toptr = inttoptr i128 -1 to ptr + %toaddr = ptrtoaddr ptr %toptr to i64 + ret i64 %toaddr +} + +define i64 @ptrtoaddr_gep_null() { +; CHECK-LABEL: define i64 @ptrtoaddr_gep_null() { +; CHECK-NEXT: ret i64 42 +; + %toaddr = ptrtoaddr ptr getelementptr (i8, ptr null, i64 42) to i64 + ret i64 %toaddr +} + +define i32 @ptrtoaddr_gep_null_addrsize() { +; CHECK-LABEL: define i32 @ptrtoaddr_gep_null_addrsize() { +; CHECK-NEXT: ret i32 42 +; + %toaddr = ptrtoaddr ptr addrspace(1) getelementptr (i8, ptr addrspace(1) null, i32 42) to i32 + ret i32 %toaddr +} + +define i64 @ptrtoaddr_gep_sub() { +; CHECK-LABEL: define i64 @ptrtoaddr_gep_sub() { +; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64)) +; + %toaddr = ptrtoaddr ptr getelementptr (i8, ptr @g, i64 sub (i64 0, i64 ptrtoaddr (ptr @g2 to i64))) to i64 + ret i64 %toaddr +} + +define i32 @ptrtoaddr_gep_sub_addrsize() { +; CHECK-LABEL: define i32 @ptrtoaddr_gep_sub_addrsize() { +; CHECK-NEXT: ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32), i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32)) +; + %toaddr = ptrtoaddr ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 sub (i32 0, i32 ptrtoaddr (ptr addrspace(1) @g2.as1 to i32))) to i32 + ret i32 %toaddr +} + +; Don't fold inttoptr of ptrtoaddr away. inttoptr will pick a previously +; exposed provenance, which is not necessarily that of @g (especially as +; ptrtoaddr does not expose the provenance.) 
+define ptr @inttoptr_of_ptrtoaddr() { +; CHECK-LABEL: define ptr @inttoptr_of_ptrtoaddr() { +; CHECK-NEXT: ret ptr inttoptr (i64 ptrtoaddr (ptr @g to i64) to ptr) +; + %toptr = inttoptr i64 ptrtoaddr (ptr @g to i64) to ptr + ret ptr %toptr +} + +define i64 @ptrtoaddr_sub_consts_unrelated() { +; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_unrelated() { +; CHECK-NEXT: ret i64 sub (i64 ptrtoaddr (ptr @g to i64), i64 ptrtoaddr (ptr @g2 to i64)) +; + %sub = sub i64 ptrtoaddr (ptr @g to i64), ptrtoaddr (ptr @g2 to i64) + ret i64 %sub +} + +define i64 @ptrtoaddr_sub_consts_offset() { +; CHECK-LABEL: define i64 @ptrtoaddr_sub_consts_offset() { +; CHECK-NEXT: ret i64 42 +; + %sub = sub i64 ptrtoaddr (ptr getelementptr (i8, ptr @g, i64 42) to i64), ptrtoaddr (ptr @g to i64) + ret i64 %sub +} + +define i32 @ptrtoaddr_sub_consts_offset_addrsize() { +; CHECK-LABEL: define i32 @ptrtoaddr_sub_consts_offset_addrsize() { +; CHECK-NEXT: ret i32 42 +; + %sub = sub i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), ptrtoaddr (ptr addrspace(1) @g.as1 to i32) + ret i32 %sub +} + +define i64 @ptrtoaddr_sub_known_offset(ptr %p) { +; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: ret i64 42 +; + %p2 = getelementptr inbounds i8, ptr %p, i64 42 + %p.addr = ptrtoaddr ptr %p to i64 + %p2.addr = ptrtoaddr ptr %p2 to i64 + %sub = sub i64 %p2.addr, %p.addr + ret i64 %sub +} + +define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) { +; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) { +; CHECK-NEXT: ret i32 42 +; + %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42 + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32 + %sub = sub i32 %p2.addr, %p.addr + ret i32 %sub +} + +define i64 @ptrtoaddr_of_ptradd_of_sub(i64 %x, ptr %p) { +; CHECK-LABEL: define i64 @ptrtoaddr_of_ptradd_of_sub( +; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: ret i64 [[X]] +; + %p.addr = ptrtoaddr ptr %p to i64 + %sub = sub i64 %x, %p.addr + %ptradd = getelementptr i8, ptr %p, i64 %sub + %ptradd.addr = ptrtoaddr ptr %ptradd to i64 + ret i64 %ptradd.addr +} + +define i32 @ptrtoaddr_of_ptradd_of_sub_addrsize(i32 %x, ptr addrspace(1) %p) { +; CHECK-LABEL: define i32 @ptrtoaddr_of_ptradd_of_sub_addrsize( +; CHECK-SAME: i32 [[X:%.*]], ptr addrspace(1) [[P:%.*]]) { +; CHECK-NEXT: ret i32 [[X]] +; + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %sub = sub i32 %x, %p.addr + %ptradd = getelementptr i8, ptr addrspace(1) %p, i32 %sub + %ptradd.addr = ptrtoaddr ptr addrspace(1) %ptradd to i32 + ret i32 %ptradd.addr +} + +define ptr @gep_of_sub_ptrtoaddr_unrelated_pointers(ptr %p, ptr %p2, i64 %x) { +; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr_unrelated_pointers( +; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[P2_ADDR:%.*]] = ptrtoaddr ptr [[P2]] to i64 +; CHECK-NEXT: [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64 +; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[P2_ADDR]], [[P_ADDR]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[SUB]] +; CHECK-NEXT: ret ptr [[GEP2]] +; + %p2.addr = ptrtoaddr ptr %p2 to i64 + %p.addr = ptrtoaddr ptr %p to i64 + %sub = sub i64 %p2.addr, %p.addr + %gep2 = getelementptr i8, ptr %p, i64 %sub + ret ptr %gep2 +} + +define ptr @gep_of_sub_ptrtoaddr(ptr %p, i64 %x) { +; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]], i64 
[[X:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: ret ptr [[GEP1]] +; + %gep1 = getelementptr i8, ptr %p, i64 %x + %gep1.addr = ptrtoaddr ptr %gep1 to i64 + %p.addr = ptrtoaddr ptr %p to i64 + %sub = sub i64 %gep1.addr, %p.addr + %gep2 = getelementptr i8, ptr %p, i64 %sub + ret ptr %gep2 +} + +define ptr addrspace(1) @gep_of_sub_ptrtoaddr_addrsize(ptr addrspace(1) %p, i32 %x) { +; CHECK-LABEL: define ptr addrspace(1) @gep_of_sub_ptrtoaddr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[P]], i32 [[X]] +; CHECK-NEXT: ret ptr addrspace(1) [[GEP1]] +; + %gep1 = getelementptr i8, ptr addrspace(1) %p, i32 %x + %gep1.addr = ptrtoaddr ptr addrspace(1) %gep1 to i32 + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %sub = sub i32 %gep1.addr, %p.addr + %gep2 = getelementptr i8, ptr addrspace(1) %p, i32 %sub + ret ptr addrspace(1) %gep2 +} + +define ptr @gep_of_sub_ptrtoaddr_ashr(ptr %p, i64 %x) { +; CHECK-LABEL: define ptr @gep_of_sub_ptrtoaddr_ashr( +; CHECK-SAME: ptr [[P:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P]], i64 [[X]] +; CHECK-NEXT: ret ptr [[GEP1]] +; + %gep1 = getelementptr i8, ptr %p, i64 %x + %gep1.addr = ptrtoaddr ptr %gep1 to i64 + %p.addr = ptrtoaddr ptr %p to i64 + %sub = sub i64 %gep1.addr, %p.addr + %ashr = ashr i64 %sub, 1 + %gep2 = getelementptr i16, ptr %p, i64 %ashr + ret ptr %gep2 +} + +define ptr addrspace(1) @gep_of_sub_ptrtoaddr_ashr_addrsize(ptr addrspace(1) %p, i32 %x) { +; CHECK-LABEL: define ptr addrspace(1) @gep_of_sub_ptrtoaddr_ashr_addrsize( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]], i32 [[X:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[P]], i32 [[X]] +; CHECK-NEXT: ret ptr addrspace(1) [[GEP1]] +; + %gep1 = getelementptr i8, ptr addrspace(1) %p, i32 %x + %gep1.addr = ptrtoaddr ptr addrspace(1) %gep1 to i32 + %p.addr = ptrtoaddr ptr addrspace(1) %p to i32 + %sub = sub i32 %gep1.addr, %p.addr + %sdiv = sdiv i32 %sub, 3 + %gep2 = getelementptr [3 x i8], ptr addrspace(1) %p, i32 %sdiv + ret ptr addrspace(1) %gep2 +} + +; Not folding this to inttoptr(123), as this may have different provenance from +; %p, and the use of ptrtoaddr implies that the provenance of %p may not be +; exposed, such that inttoptr cannot recover it. 
+define ptr @gep_gep_neg_ptrtoaddr(ptr %p) { +; CHECK-LABEL: define ptr @gep_gep_neg_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 123 +; CHECK-NEXT: [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64 +; CHECK-NEXT: [[P_ADDR_NEG:%.*]] = sub i64 0, [[P_ADDR]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[P_ADDR_NEG]] +; CHECK-NEXT: ret ptr [[GEP2]] +; + %gep1 = getelementptr inbounds i8, ptr %p, i64 123 + %p.addr = ptrtoaddr ptr %p to i64 + %p.addr.neg = sub i64 0, %p.addr + %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.neg + ret ptr %gep2 +} + +define ptr @gep_gep_inv_ptrtoaddr(ptr %p) { +; CHECK-LABEL: define ptr @gep_gep_inv_ptrtoaddr( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 123 +; CHECK-NEXT: [[P_ADDR:%.*]] = ptrtoaddr ptr [[P]] to i64 +; CHECK-NEXT: [[P_ADDR_INV:%.*]] = xor i64 [[P_ADDR]], -1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 [[P_ADDR_INV]] +; CHECK-NEXT: ret ptr [[GEP2]] +; + %gep1 = getelementptr inbounds i8, ptr %p, i64 123 + %p.addr = ptrtoaddr ptr %p to i64 + %p.addr.inv = xor i64 %p.addr, -1 + %gep2 = getelementptr i8, ptr %gep1, i64 %p.addr.inv + ret ptr %gep2 +} diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll index 22ab79d..0b8f676 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll @@ -191,7 +191,7 @@ define void @interleave_nxi8_factor2_masked_store_splatmask(ptr %ptr, <vscale x ; CHECK-NEXT: [[INTERLEAVE:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[L]], <vscale x 16 x i8> [[R]]) ; CHECK-NEXT: [[MASK_INS:%.*]] = insertelement <vscale x 32 x i1> poison, i1 [[MASK]], i64 0 ; CHECK-NEXT: [[MASK_SPLAT:%.*]] = shufflevector <vscale x 32 x i1> [[MASK_INS]], <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer -; CHECK-NEXT: tail call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVE]], ptr [[PTR]], i32 1, <vscale x 32 x i1> [[MASK_SPLAT]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVE]], ptr align 1 [[PTR]], <vscale x 32 x i1> [[MASK_SPLAT]]) ; CHECK-NEXT: ret void ; %interleave = tail call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r) diff --git a/llvm/test/Transforms/LICM/licm-ci.ll b/llvm/test/Transforms/LICM/licm-ci.ll index f16c67f..b818442 100644 --- a/llvm/test/Transforms/LICM/licm-ci.ll +++ b/llvm/test/Transforms/LICM/licm-ci.ll @@ -13,7 +13,7 @@ define i16 @test(ptr %in) { ; CHECK: exit: ; CHECK-NEXT: [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[LOOP]] ] ; CHECK-NEXT: [[GEP_LE:%.*]] = getelementptr <4 x i16>, ptr [[IN:%.*]], i32 [[I_LCSSA]] -; CHECK-NEXT: [[LOAD_LE:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[GEP_LE]], i32 2, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> <i16 0, i16 poison, i16 0, i16 poison>), !alias.scope [[META0:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[LOAD_LE:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[GEP_LE]], <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> <i16 0, i16 poison, i16 0, i16 poison>), !alias.scope [[META0:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[REDUCE_LE:%.*]] = call i16 
@llvm.vector.reduce.add.v4i16(<4 x i16> [[LOAD_LE]]) ; CHECK-NEXT: ret i16 [[REDUCE_LE]] ; diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll index 3903776..39fb774 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll @@ -40,9 +40,9 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -135,9 +135,9 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) { ; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; 
LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -263,9 +263,9 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -356,9 +356,9 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; 
LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -479,9 +479,9 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3 ; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -583,9 +583,9 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3 ; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -736,9 +736,9 @@ define i32 
@compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -835,9 +835,9 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; LOOP-DEL-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; LOOP-DEL-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; LOOP-DEL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; LOOP-DEL-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; LOOP-DEL-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; LOOP-DEL-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; LOOP-DEL-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -967,9 +967,9 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { ; CHECK-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], 
[[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; CHECK-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP18]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VEC_INDEX]] -; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP20]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <vscale x 16 x i8> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP23]]) diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll index ddffd39..62d15b5 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll @@ -39,14 +39,14 @@ define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_sta ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) ; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]] -; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] ; CHECK: [[MATCH_CHECK_VEC]]: ; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) ; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]] -; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0 ; CHECK-NEXT: 
[[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer @@ -195,14 +195,14 @@ define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_st ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) ; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[SEARCH_PRED]] -; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer) +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PSEARCH]], <vscale x 8 x i1> [[SEARCH_MASKED]], <vscale x 8 x i16> zeroinitializer) ; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] ; CHECK: [[MATCH_CHECK_VEC]]: ; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) ; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 8 x i1> [[TMP0]], [[NEEDLE_PRED]] -; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> zeroinitializer) +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 1 [[PNEEDLE]], <vscale x 8 x i1> [[NEEDLE_MASKED]], <vscale x 8 x i16> zeroinitializer) ; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 8 x i16> [[NEEDLE_LOAD_VEC]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[NEEDLE0]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[NEEDLE0_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer @@ -351,14 +351,14 @@ define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) ; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]] -; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] ; CHECK: [[MATCH_CHECK_VEC]]: ; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) ; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]] -; 
CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer @@ -514,14 +514,14 @@ define ptr @ensure_not_found_successors_fixed(ptr %search_start, ptr %search_end ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) ; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]] -; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] ; CHECK: [[MATCH_CHECK_VEC]]: ; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) ; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]] -; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer @@ -665,14 +665,14 @@ define ptr @ensure_not_found_successors_fixed2(ptr %search_start, ptr %search_en ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) ; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]] -; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PSEARCH]], <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: br 
label %[[MATCH_CHECK_VEC:.*]] ; CHECK: [[MATCH_CHECK_VEC]]: ; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) ; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]] -; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[PNEEDLE]], <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0 ; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer @@ -960,5 +960,6 @@ exit: } attributes #0 = { "target-features"="+sve2" } - +;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90} +;. diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll index 110b4a8..a82e5eb 100644 --- a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll +++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll @@ -289,9 +289,9 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) { ; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -656,9 +656,9 @@ define i32 @compare_bytes_signed_wrap(ptr %a, ptr %b, i32 %len, i32 %n) { ; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], 
[[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -1070,9 +1070,9 @@ define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i3 ; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -1510,9 +1510,9 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { ; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP19]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP30:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ [[TMP1]], 
[[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP29:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP23:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP22]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP26:%.*]] = icmp ne <vscale x 16 x i8> [[TMP23]], [[TMP25]] ; MASKED-NEXT: [[TMP27:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP26]], <vscale x 16 x i1> zeroinitializer ; MASKED-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP27]]) @@ -1852,9 +1852,9 @@ define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { ; MASKED-NEXT: [[MISMATCH_VEC_LOOP_PRED:%.*]] = phi <vscale x 16 x i1> [ [[TMP15]], [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP26:%.*]], [[MISMATCH_VEC_LOOP_INC:%.*]] ] ; MASKED-NEXT: [[MISMATCH_VEC_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VEC_LOOP_PREHEADER]] ], [ [[TMP25:%.*]], [[MISMATCH_VEC_LOOP_INC]] ] ; MASKED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP18]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VEC_INDEX]] -; MASKED-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) +; MASKED-NEXT: [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP20]], <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i8> zeroinitializer) ; MASKED-NEXT: [[TMP22:%.*]] = icmp ne <vscale x 16 x i8> [[TMP19]], [[TMP21]] ; MASKED-NEXT: [[TMP23:%.*]] = select <vscale x 16 x i1> [[MISMATCH_VEC_LOOP_PRED]], <vscale x 16 x i1> [[TMP22]], <vscale x 16 x i1> zeroinitializer ; MASKED-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP23]]) diff --git a/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll b/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll new file mode 100644 index 0000000..a5e3acc --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/lcssa-phi-outer-latch.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s + 
+; This test is checking that blocks outer.body and outer.latch, where outer.body is the exit +; block of the inner loop and outer.latch the latch of the outer loop, correctly +; deal with the phi-node use-def chain %new.cond.lcssa -> %old.cond.lcssa. What we expect +; here is that block outer.latch does not contain a phi node, because it is a single input +; phi in a non-exit block. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i16 @main(ptr %a) { +; CHECK-LABEL: define i16 @main( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[INNER_HEADER_PREHEADER:.*]] +; CHECK: [[OUTER_HEADER_PREHEADER:.*]]: +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[OUTER_LATCH:.*]] ], [ 1, %[[OUTER_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[INNER_HEADER_SPLIT:.*]] +; CHECK: [[INNER_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[INNER_HEADER:.*]] +; CHECK: [[INNER_HEADER]]: +; CHECK-NEXT: [[J:%.*]] = phi i16 [ [[TMP1:%.*]], %[[INNER_LATCH_SPLIT:.*]] ], [ 0, %[[INNER_HEADER_PREHEADER]] ] +; CHECK-NEXT: br label %[[OUTER_HEADER_PREHEADER]] +; CHECK: [[INNER_HEADER_SPLIT]]: +; CHECK-NEXT: [[ARRAYIDX_US_US:%.*]] = getelementptr i16, ptr [[A]], i16 [[J]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX_US_US]], align 1 +; CHECK-NEXT: [[COND:%.*]] = select i1 false, i16 0, i16 0 +; CHECK-NEXT: br label %[[INNER_LATCH:.*]] +; CHECK: [[INNER_LATCH]]: +; CHECK-NEXT: [[J_NEXT:%.*]] = add i16 [[J]], 1 +; CHECK-NEXT: br label %[[OUTER_BODY:.*]] +; CHECK: [[INNER_LATCH_SPLIT]]: +; CHECK-NEXT: [[NEW_COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[OUTER_LATCH]] ] +; CHECK-NEXT: [[TMP1]] = add i16 [[J]], 1 +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[INNER_HEADER]] +; CHECK: [[OUTER_BODY]]: +; CHECK-NEXT: br label %[[OUTER_LATCH]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; CHECK-NEXT: [[CMP286_US:%.*]] = icmp ugt i64 [[I]], 0 +; CHECK-NEXT: br i1 [[CMP286_US]], label %[[OUTER_HEADER]], label %[[INNER_LATCH_SPLIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[OLD_COND_LCSSA_LCSSA:%.*]] = phi i16 [ [[NEW_COND_LCSSA]], %[[INNER_LATCH_SPLIT]] ] +; CHECK-NEXT: ret i16 [[OLD_COND_LCSSA_LCSSA]] +; +entry: + br label %outer.header + +outer.header: + %i = phi i64 [ 1, %entry ], [ %i.next, %outer.latch ] + br label %inner.header + +inner.header: + %j = phi i16 [ 0, %outer.header ], [ %j.next, %inner.latch ] + %arrayidx.us.us = getelementptr i16, ptr %a, i16 %j + %0 = load i16, ptr %arrayidx.us.us, align 1 + %cond = select i1 false, i16 0, i16 0 + br label %inner.latch + +inner.latch: + %j.next = add i16 %j, 1 + br i1 true, label %outer.body, label %inner.header + +outer.body: + %new.cond.lcssa = phi i16 [ %cond, %inner.latch ] + br label %outer.latch + +outer.latch: + %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %outer.body ] + %i.next = add i64 %i, 1 + %cmp286.us = icmp ugt i64 %i, 0 + br i1 %cmp286.us, label %outer.header, label %exit + +exit: + ret i16 %old.cond.lcssa +} diff --git a/llvm/test/Transforms/LoopUnroll/ARM/mve-upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/mve-upperbound.ll index 3673b1b..214d6ce 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/mve-upperbound.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/mve-upperbound.ll @@ -27,10 +27,10 @@ define void @unroll_upper(ptr noundef %pSrc, ptr nocapture noundef writeonly %pD ; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP37:%.*]] = getelementptr i8, ptr 
[[PSRC]], i32 [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[AND]]) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[NEXT_GEP37]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[NEXT_GEP37]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[WIDE_MASKED_LOAD]], splat (i16 8) ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> -; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP3]], ptr [[NEXT_GEP]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP3]], ptr align 1 [[NEXT_GEP]], <8 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[WHILE_END12_LOOPEXIT:%.*]], label [[VECTOR_BODY]] diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll index 2b5960e..90d0db2 100644 --- a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll +++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -56,7 +56,7 @@ if.then46: ; preds = %for.body40 br label %for.inc50 for.inc50: ; preds = %if.then46, %for.body40 - %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ] + %k.1 = phi i32 [ 0, %for.body40 ], [ %inc47, %if.then46 ] %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ] %indvars.iv.next124 = add i64 %indvars.iv123, 1 %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 6e3d257..0415b01 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -27,7 +27,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8> -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr align 1 [[NEXT_GEP]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]] @@ -89,7 +89,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP11:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i64> [[TMP11]] to <vscale x 8 x i8> -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[NEXT_GEP]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void 
@llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr align 1 [[NEXT_GEP]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 6fe6883..f163517 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -470,7 +470,7 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt ; DEFAULT-NEXT: store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META15]], !noalias [[META17]] ; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE37]] ; DEFAULT: [[PRED_STORE_CONTINUE37]]: -; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] +; DEFAULT-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP16]], <4 x i1> [[TMP8]]), !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -579,7 +579,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer ; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1) ; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double> -; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[NEXT_GEP]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[NEXT_GEP]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]]) ; PRED-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -1326,10 +1326,10 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; PRED-NEXT: [[TMP13:%.*]] = uitofp <vscale x 16 x i8> 
[[WIDE_MASKED_LOAD]] to <vscale x 16 x float> ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; PRED-NEXT: [[TMP15:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], zeroinitializer ; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer ; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1) @@ -1344,7 +1344,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x float> @llvm.fmuladd.nxv16f32(<vscale x 16 x float> [[TMP24]], <vscale x 16 x float> splat (float 3.000000e+00), <vscale x 16 x float> [[TMP13]]) ; PRED-NEXT: [[TMP26:%.*]] = fptoui <vscale x 16 x float> [[TMP25]] to <vscale x 16 x i8> ; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr align 1 [[TMP27]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) ; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index d10a26d..72e813b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -139,7 +139,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], [[TMP30]] ; CHECK-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP33]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr [[TMP34]], i32 8, <vscale x 2 x i1> [[TMP23]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> zeroinitializer, ptr align 8 [[TMP34]], <vscale x 2 x i1> [[TMP23]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]]) ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -241,12 +241,12 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: 
[[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0 ; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[INDEX]], [[MUL_2_I]] ; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]] ; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]] ; CHECK-NEXT: [[TMP27:%.*]] = udiv i64 [[TMP26]], [[X]] ; CHECK-NEXT: [[TMP28:%.*]] = urem i64 [[TMP26]], [[X]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <vscale x 2 x i64> [[TMP23]], i32 0 ; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[X]], [[TMP29]] ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], [[TMP25]] ; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], [[X]] @@ -256,7 +256,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32 ; CHECK-NEXT: [[TMP37:%.*]] = ashr i64 [[TMP36]], 32 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP37]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr [[TMP38]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP23]], ptr align 4 [[TMP38]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]]) ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll index 71acac2..f78ce0a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll @@ -21,7 +21,7 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], -1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP6]] -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[TMP5]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr align 8 [[TMP7]], <4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: @@ -54,12 +54,12 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] ; CHECK: [[PRED_LOAD_CONTINUE6]]: ; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x double> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ] -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[TMP4]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr align 8 [[TMP7]], <4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq <4 x double> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP5]], [[TMP29]] ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[TMP6]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP32]], i32 4, <4 x i1> [[TMP30]]) +; 
CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr align 4 [[TMP32]], <4 x i1> [[TMP30]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index 307d4c4..d23e3c2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -27,15 +27,15 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]]) -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1 @@ -61,8 +61,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP8]], i32 1 ; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[NEXT_GEP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll index 4bb8a0e..e57b28f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll @@ -17,6 +17,7 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 
poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -59,9 +60,8 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP24]] = phi <4 x i16> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP24]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> ; CHECK-NEXT: [[TMP26:%.*]] = sext <4 x i16> [[TMP25]] to <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP27]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP26]], ptr align 4 [[TMP28]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX_NEXT]], i64 1002) ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll index a3a0c3e7..16e9d410 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll @@ -70,7 +70,7 @@ define void @PR34711(ptr %a, ptr %b, ptr %c, i64 %n) #0 { ; CHECK-VF4UF1-LABEL: @PR34711 ; CHECK-VF4UF1: vector.body ; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ] -; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> {{.*}}, i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison) +; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> {{.*}}, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison) ; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1) ; CHECK-VF4UF1-NEXT: %[[SXT1:.*]] = sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32> ; CHECK-VF4UF1-NEXT: %[[SXT2:.*]] = sext <vscale x 4 x i16> %[[MGATHER]] to <vscale x 4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll index 7548bf6..faee4c1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll @@ -24,7 +24,7 @@ define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly ; CHECK-UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 ; CHECK-UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-UF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr 
[[DST]], i64 [[INDEX]] -; CHECK-UF1-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF1-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-UF1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-UF1-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -59,10 +59,10 @@ define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly ; CHECK-UF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 4 ; CHECK-UF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 8 ; CHECK-UF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 12 -; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK4]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK5]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP19]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP8]], <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP17]], <4 x i1> [[ACTIVE_LANE_MASK4]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP18]], <4 x i1> [[ACTIVE_LANE_MASK5]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP19]], <4 x i1> [[ACTIVE_LANE_MASK6]]) ; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-UF4-NEXT: [[TMP12]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 12) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll index 4ddf51b..26a9545 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll @@ -88,7 +88,7 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur ; SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP5]], align 4 ; SVE-NEXT: [[TMP7:%.*]] = sext <vscale x 2 x i32> [[WIDE_LOAD]] to <vscale x 2 x i64> ; SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DATA:%.*]], <vscale x 2 x i64> [[TMP7]] -; SVE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> poison) +; SVE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x double> poison) ; SVE-NEXT: [[TMP9]] = 
fadd <vscale x 2 x double> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] ; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index fd6e275..cfc6cc8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -148,14 +148,14 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; PRED-NEXT: [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16> ; PRED-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i16> [[TMP13]], [[TMP11]] ; PRED-NEXT: [[TMP15:%.*]] = or <vscale x 16 x i16> [[TMP14]], [[TMP13]] ; PRED-NEXT: [[TMP16:%.*]] = lshr <vscale x 16 x i16> [[TMP15]], splat (i16 1) ; PRED-NEXT: [[TMP17:%.*]] = trunc <vscale x 16 x i16> [[TMP16]] to <vscale x 16 x i8> ; PRED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP17]], ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP17]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP10]]) ; PRED-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index e3e4833..5d80353 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -69,7 +69,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 2 x i64> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> align 1 [[TMP28]], <vscale x 2 x i1> 
splat (i1 true), <vscale x 2 x i8> poison) ; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]]) ; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]]) @@ -200,7 +200,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <vscale x 2 x i64> [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> align 1 [[TMP28]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison) ; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]]) ; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]]) @@ -284,17 +284,17 @@ define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[GEP_J]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; CHECK-NEXT: [[TMP5:%.*]] = trunc <4 x i64> [[STRIDED_VEC]] to <4 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP6]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 ; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 ; CHECK-NEXT: store i16 [[TMP12]], ptr [[TMP8]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 ; CHECK-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store i64 0, ptr [[A]], align 8 ; CHECK-NEXT: store i64 0, ptr [[B]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll index 803ffa8..cc3b1c9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll @@ -26,7 +26,7 @@ 
define void @replicate_sdiv_conditional(ptr noalias %a, ptr noalias %b, ptr noal ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i32> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = ashr <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1) ; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[WIDE_LOAD]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index 75b18ff..cf45f3a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -403,9 +403,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef % ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP13]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]]) ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index d8a81f9..31453e9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -37,7 +37,7 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[N_VEC]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4:[0-9]+]] +; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] ; TFNONE-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -56,10 +56,10 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] -; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFCOMMON-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -85,16 +85,16 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP7]], i64 [[TMP9]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]] -; TFA_INTERLEAVE-NEXT: call 
void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP12]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP11]], ptr align 8 [[TMP13]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP12]], ptr align 8 [[TMP16]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 @@ -160,7 +160,7 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[TMP12]], 50 ; TFNONE-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END]] ; TFNONE: [[IF_THEN]]: -; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @foo(i64 [[TMP12]]) #[[ATTR4]] +; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @foo(i64 [[TMP12]]) #[[ATTR3]] ; TFNONE-NEXT: br label %[[IF_END]] ; TFNONE: [[IF_END]]: ; TFNONE-NEXT: [[TMP14:%.*]] = phi i64 [ [[TMP9]], %[[IF_THEN]] ], [ 0, %[[FOR_BODY]] ] @@ -182,13 +182,13 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 50) ; TFCOMMON-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i1> zeroinitializer ; TFCOMMON-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP7]]) ; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i64> zeroinitializer ; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFCOMMON-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -214,8 +214,8 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 ; TFA_INTERLEAVE-NEXT: 
[[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 50) ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], splat (i64 50) ; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP11]], <vscale x 2 x i1> zeroinitializer @@ -228,8 +228,8 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = shl nuw i64 [[TMP18]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP17]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr align 8 [[TMP20]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = shl nuw i64 [[TMP21]], 1 @@ -308,10 +308,10 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[TMP13]], 50 ; TFNONE-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] ; TFNONE: [[IF_THEN]]: -; TFNONE-NEXT: [[TMP14:%.*]] = call i64 @foo(i64 [[TMP13]]) #[[ATTR5:[0-9]+]] +; TFNONE-NEXT: [[TMP14:%.*]] = call i64 @foo(i64 [[TMP13]]) #[[ATTR4:[0-9]+]] ; TFNONE-NEXT: br label %[[IF_END]] ; TFNONE: [[IF_ELSE]]: -; TFNONE-NEXT: [[TMP15:%.*]] = call i64 @foo(i64 0) #[[ATTR5]] +; TFNONE-NEXT: [[TMP15:%.*]] = call i64 @foo(i64 0) #[[ATTR4]] ; TFNONE-NEXT: br label %[[IF_END]] ; TFNONE: [[IF_END]]: ; TFNONE-NEXT: [[TMP16:%.*]] = phi i64 [ [[TMP14]], %[[IF_THEN]] ], [ [[TMP15]], %[[IF_ELSE]] ] @@ -333,7 +333,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TFCOMMON-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 50) ; TFCOMMON-NEXT: [[TMP7:%.*]] = xor <vscale x 2 x i1> [[TMP6]], splat (i1 true) ; TFCOMMON-NEXT: [[TMP8:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i1> zeroinitializer @@ -342,7 +342,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[TMP10]]) ; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP6]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP9]] ; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFCOMMON-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -368,8 +368,8 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 50) ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], splat (i64 50) ; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = xor <vscale x 2 x i1> [[TMP11]], splat (i1 true) @@ -388,8 +388,8 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = shl nuw i64 [[TMP24]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[TMP23]], i64 [[TMP25]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr [[TMP23]], i32 8, 
<vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr [[TMP26]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI]], ptr align 8 [[TMP23]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[PREDPHI4]], ptr align 8 [[TMP26]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = shl nuw i64 [[TMP27]], 1 @@ -464,7 +464,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[N_VEC]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] +; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -481,7 +481,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFALWAYS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; TFALWAYS-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFALWAYS-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] +; TFALWAYS-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFALWAYS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFALWAYS-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFALWAYS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -512,7 +512,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N_VEC]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] +; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -529,7 +529,7 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFA_INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; TFA_INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] +; TFA_INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFA_INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 
[[INDVARS_IV]] ; TFA_INTERLEAVE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFA_INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -588,7 +588,7 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[N_VEC]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; TFNONE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR7:[0-9]+]] +; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -607,10 +607,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; TFALWAYS-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFALWAYS-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -629,10 +629,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; 
TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFFALLBACK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -658,16 +658,16 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP7]], i64 [[TMP9]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x i64> poison) ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP12]], ptr [[TMP16]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP11]], ptr align 8 [[TMP13]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP12]], ptr align 8 [[TMP16]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 1 @@ -739,7 +739,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFNONE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8 ; TFNONE-NEXT: [[MULADD]] = tail call double @llvm.fmuladd.f64(double [[LOAD]], double [[M]], double [[FMA_SUM]]) ; TFNONE-NEXT: [[TOINT:%.*]] = fptoui double [[LOAD]] to i64 -; TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[TOINT]]) #[[ATTR4]] +; 
TFNONE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[TOINT]]) #[[ATTR3]] ; TFNONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFNONE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 ; TFNONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -762,12 +762,12 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; TFALWAYS-NEXT: [[TMP6:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] ; TFALWAYS-NEXT: [[TMP7:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64> ; TFALWAYS-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFALWAYS-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> splat (double -0.000000e+00) ; TFALWAYS-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP10]]) ; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] @@ -791,12 +791,12 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] ; TFFALLBACK-NEXT: [[TMP7:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64> ; TFFALLBACK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> 
[[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; TFFALLBACK-NEXT: [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> splat (double -0.000000e+00) ; TFFALLBACK-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP10]]) ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] @@ -827,8 +827,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[TMP7]], i64 [[TMP9]] -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) -; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison) ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD3]], [[BROADCAST_SPLAT]] ; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64> @@ -839,8 +839,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = shl nuw i64 [[TMP18]], 1 ; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP17]], i64 [[TMP19]] -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr [[TMP17]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr [[TMP20]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP15]], ptr align 8 [[TMP17]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP16]], ptr align 8 [[TMP20]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP11]], <vscale x 2 x double> splat (double -0.000000e+00) ; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP21]]) ; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> [[TMP12]], <vscale x 2 x double> splat (double -0.000000e+00) @@ -912,7 +912,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 
{ ; TFNONE: [[LOOP]]: ; TFNONE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_END:.*]] ] ; TFNONE-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8 -; TFNONE-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR8:[0-9]+]] +; TFNONE-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR7:[0-9]+]] ; TFNONE-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00 ; TFNONE-NEXT: br i1 [[COND1]], label %[[LOOP_MIDDLE:.*]], label %[[LOOP_END]] ; TFNONE: [[LOOP_MIDDLE]]: @@ -933,7 +933,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFCOMMON: [[LOOP]]: ; TFCOMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; TFCOMMON-NEXT: [[LD:%.*]] = load double, ptr [[P2]], align 8 -; TFCOMMON-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR7:[0-9]+]] +; TFCOMMON-NEXT: [[EXP:%.*]] = tail call double @llvm.exp.f64(double [[LD]]) #[[ATTR6:[0-9]+]] ; TFCOMMON-NEXT: [[COND1:%.*]] = fcmp ogt double [[EXP]], 0.000000e+00 ; TFCOMMON-NEXT: [[SINK:%.*]] = select i1 [[COND1]], double 0.000000e+00, double 1.000000e+00 ; TFCOMMON-NEXT: store double [[SINK]], ptr [[P]], align 8 @@ -958,7 +958,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[TMP9]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], %[[TMP9]] ] ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 -; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR6:[0-9]+]] ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 ; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00 ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = or i1 [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK2]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll index 300f5d9..5a2eee0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll @@ -203,8 +203,8 @@ define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly ; CHECK-ARMPL: [[TMP15:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) ; CHECK-ARMPL: [[TMP16:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP15]], 0 ; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP15]], 1 -; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr [[TMP19:%.*]], i32 4, <vscale x 4 x i1> [[TMP14:%.*]]) -; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP17]], ptr [[TMP21:%.*]], i32 4, <vscale x 4 x i1> [[TMP14]]) +; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr align 4 [[TMP13:%.*]], <vscale x 4 x i1> [[TMP9:%.*]]) +; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP17]], ptr align 4 [[TMP14:%.*]], <vscale 
x 4 x i1> [[TMP9]]) ; CHECK-ARMPL: [[IF_MERGE:.*:]] ; CHECK-ARMPL: [[FOR_END:.*:]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index 56ace54..f50d083 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -472,7 +472,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i8> [[TMP19]], [[BROADCAST_SPLAT4]] ; DEFAULT-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]] ; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; DEFAULT-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr align 1 [[TMP22]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; DEFAULT-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; DEFAULT-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -522,7 +522,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i8> [[TMP19]], [[BROADCAST_SPLAT4]] ; OPTSIZE-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]] ; OPTSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; OPTSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr align 1 [[TMP22]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; OPTSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; OPTSIZE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -572,7 +572,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[TMP20:%.*]] = mul <vscale x 16 x i8> [[TMP19]], [[BROADCAST_SPLAT4]] ; MINSIZE-NEXT: [[TMP21:%.*]] = add <vscale x 16 x i8> [[TMP18]], [[TMP20]] ; MINSIZE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr [[TMP22]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; MINSIZE-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP21]], ptr align 1 [[TMP22]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; MINSIZE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; MINSIZE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; MINSIZE-NEXT: [[TMP24:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll index 50df6fc..5b61fba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll @@ -28,13 +28,13 @@ define void @foo() { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_LATCH:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) ; CHECK-NEXT: br label [[INNER_LOOP1:%.*]] ; CHECK: inner_loop1: ; CHECK-NEXT: [[TMP7:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP11:%.*]], [[INNER_LOOP1]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi <vscale x 4 x float> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP10:%.*]], [[INNER_LOOP1]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <vscale x 4 x i64> [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP10]] = fmul <vscale x 4 x float> [[TMP8]], [[WIDE_MASKED_GATHER2]] ; CHECK-NEXT: [[TMP11]] = add nuw nsw <vscale x 4 x i64> [[TMP7]], splat (i64 1) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i64> [[TMP11]], splat (i64 512) @@ -42,7 +42,7 @@ define void @foo() { ; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_LATCH]], label [[INNER_LOOP1]] ; CHECK: vector.latch: ; CHECK-NEXT: [[TMP14:%.*]] = phi <vscale x 4 x float> [ [[TMP10]], [[INNER_LOOP1]] ] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[TMP14]], <vscale x 4 x ptr> [[TMP6]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[TMP14]], <vscale x 4 x ptr> align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll index 604e5b3..6d0777e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll @@ -38,13 +38,13 @@ define void @foo_i32(i32 %n) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> -; CHECK-NEXT: call void 
@llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label %[[FOR_BODY31:.*]] ; CHECK: [[FOR_BODY31]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 8) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 @@ -100,13 +100,13 @@ define void @foo_i64(i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [8 x i64], ptr @arrX, i64 0, <2 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> [[TMP0]], i32 4, <2 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VEC_IND]], <2 x ptr> align 4 [[TMP0]], <2 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label %[[FOR_BODY31:.*]] ; CHECK: [[FOR_BODY31]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[FOR_BODY31]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [8 x [8 x i64]], ptr @arrY, i64 0, <2 x i64> [[VEC_PHI]], <2 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[TMP1]], <2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[TMP1]], <2 x ptr> align 4 [[TMP2]], <2 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP3]] = add nuw nsw <2 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i64> [[TMP3]], splat (i64 8) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 5ae0839..3dfa6df 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -1361,132 +1361,6 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 } -define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) { -; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain( -; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEON-NEXT: entry: -; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 
[[END1]], 1 -; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] -; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 -; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-NEON: vector.ph: -; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 -; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0 -; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-NEON: vector.body: -; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] -; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] -; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] -; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; CHECK-NEON: middle.block: -; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) -; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-NEON: scalar.ph: -; -; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain( -; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-SVE-NEXT: entry: -; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 -; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] -; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] -; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] -; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0 -; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer -; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-SVE: vector.body: -; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-SVE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]] -; CHECK-SVE-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]] -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]]) -; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-SVE: scalar.ph: -; -; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain( -; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-SVE-MAXBW-NEXT: entry: -; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 -; CHECK-SVE-MAXBW-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 -; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] -; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3 -; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK-SVE-MAXBW: vector.ph: -; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 -; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]] -; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0 -; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer -; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK-SVE-MAXBW: vector.body: -; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] -; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1 -; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]] -; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] -; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] -; CHECK-SVE-MAXBW: middle.block: -; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]]) -; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] -; CHECK-SVE-MAXBW: scalar.ph: -; -entry: - br label %loop - -loop: - %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] - %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] - %l = load i8, ptr %ptr.iv, align 1 - %l.ext = zext i8 %l to i32 - %add = add i32 %red, %l.ext - %red.next = add i32 %add, %offset - %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 - %ec = icmp eq ptr %ptr.iv, %end - br i1 %ec, label %exit, label %loop - -exit: - ret i32 %red.next -} - attributes #0 = { vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 56a5663..1ace7d4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1302,10 +1302,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP11]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]] @@ -1340,10 +1340,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] -; 
CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP11]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 4 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 4 x i32> [[TMP16]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i32> [[TMP17]], [[VEC_PHI]] @@ -1378,10 +1378,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP11]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll new file mode 100644 index 0000000..d80178fd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize 
-force-vector-interleave=1 -enable-epilogue-vectorization=false -S %s | FileCheck %s --check-prefixes=CHECK-NEON + +target triple = "arm64-apple-macosx" + +define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) { +; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain( +; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEON-NEXT: [[ENTRY:.*]]: +; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEON: [[VECTOR_PH]]: +; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0 +; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEON-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NEON: [[VECTOR_BODY]]: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEON-NEXT: [[TMP4:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-NEON-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEON: [[MIDDLE_BLOCK]]: +; CHECK-NEON-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEON: [[SCALAR_PH]]: +; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEON-NEXT: br label %[[LOOP:.*]] +; CHECK-NEON: [[LOOP]]: +; CHECK-NEON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEON-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEON-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEON-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[L_EXT]] +; CHECK-NEON-NEXT: [[RED_NEXT]] = add i32 [[ADD]], [[OFFSET]] +; CHECK-NEON-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEON-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEON-NEXT: br i1 [[EC]], label 
%[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEON: [[EXIT]]: +; CHECK-NEON-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEON-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %add = add i32 %red, %l.ext + %red.next = add i32 %add, %offset + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 1e6bcb1..70532ad 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -494,7 +494,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP6]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]] @@ -523,7 +523,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP6]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP6]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[VEC_PHI]] ; CHECK-INTERLEAVED-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]] @@ -552,7 +552,7 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ 
[[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP6]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP8]], <vscale x 16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP9]]) @@ -1117,24 +1117,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVE1-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4 ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> -; CHECK-INTERLEAVE1-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 
@llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -1143,42 +1143,42 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVED-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16 ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 16 ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 2 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 6 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_LOAD4]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_LOAD5]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <2 x i32> [[WIDE_LOAD6]] to <2 x i64> -; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <2 x i64> 
[[VEC_PHI2]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <2 x i64> [[VEC_PHI3]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD4]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE7]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI1]], <4 x i64> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE8]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI2]], <4 x i64> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE9]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI3]], <4 x i64> [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <2 x i64> [[TMP10]], [[BIN_RDX]] -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <2 x i64> [[TMP11]], [[BIN_RDX7]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX8]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE7]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX10:%.*]] = add <2 x i64> [[PARTIAL_REDUCE8]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX11:%.*]] = add <2 x i64> [[PARTIAL_REDUCE9]], [[BIN_RDX10]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX11]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -1187,24 +1187,24 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-MAXBW-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) #[[ATTR2]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 2 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 2 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4 ; CHECK-MAXBW-NEXT: 
[[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64> -; CHECK-MAXBW-NEXT: [[TMP2]] = add <2 x i64> [[VEC_PHI]], [[TMP1]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v4i64(<2 x i64> [[VEC_PHI]], <4 x i64> [[TMP1]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP2]]) +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll index 20b5364..ebf4a4f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll @@ -9,9 +9,7 @@ define i32 @fn1() local_unnamed_addr #0 { ; We expect the backend to expand all reductions. 
; CHECK: @llvm.vector.reduce entry: - %0 = load i32, ptr @b, align 4, !tbaa !1 - %cmp40 = icmp sgt i32 %0, 0 - br i1 %cmp40, label %for.body.lr.ph, label %for.end + br label %for.body.lr.ph for.body.lr.ph: ; preds = %entry %1 = load ptr, ptr @a, align 8, !tbaa !5 @@ -21,8 +19,8 @@ for.body.lr.ph: ; preds = %entry for.body: ; preds = %for.body.lr.ph, %for.body %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ] - %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ] + %d.043 = phi i16 [ 0, %for.body.lr.ph ], [ %.sink28, %for.body ] + %c.042 = phi i16 [ 0, %for.body.lr.ph ], [ %c.0., %for.body ] %arrayidx = getelementptr inbounds i16, ptr %1, i64 %indvars.iv %4 = load i16, ptr %arrayidx, align 2, !tbaa !7 %cmp2 = icmp sgt i16 %c.042, %4 @@ -33,10 +31,8 @@ for.body: ; preds = %for.body.lr.ph, %fo %cmp = icmp slt i64 %indvars.iv.next, %3 br i1 %cmp, label %for.body, label %for.end -for.end: ; preds = %for.body, %entry - %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ] - %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ] - %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa +for.end: ; preds = %for.body + %cmp26 = icmp sgt i16 %c.0., %.sink28 %conv27 = zext i1 %cmp26 to i32 ret i32 %conv27 } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll index 44820e0..33ce300 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -18,7 +18,7 @@ define void @_Z1dv() local_unnamed_addr #0 { ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ] -; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ] +; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ] ; CHECK-NEXT: [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4 ; CHECK-NEXT: [[CONV:%.*]] = and i32 [[F_0]], 65535 ; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]] @@ -50,7 +50,7 @@ entry: for.cond: ; preds = %for.cond.cleanup, %entry %f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ] - %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ] + %g.0 = phi i32 [ 0, %entry ], [ %g.1.lcssa, %for.cond.cleanup ] %cmp12 = icmp ult i32 %g.0, 4 %conv = and i32 %f.0, 65535 br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 0f82de62..44ae175 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -105,16 +105,16 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; VSCALEFORTUNING2-NEXT: [[TMP36:%.*]] = or <vscale x 4 x i32> [[TMP11]], [[TMP34]] ; VSCALEFORTUNING2-NEXT: [[TMP37:%.*]] = or <vscale x 4 x i32> [[TMP35]], [[BROADCAST_SPLAT]] ; VSCALEFORTUNING2-NEXT: [[TMP38:%.*]] = or <vscale x 4 x i32> [[TMP36]], [[BROADCAST_SPLAT]] -; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x 
i32> poison) -; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[DOTSPLAT]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[DOTSPLAT]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; VSCALEFORTUNING2-NEXT: [[TMP39:%.*]] = lshr <vscale x 4 x i32> [[TMP37]], splat (i32 1) ; VSCALEFORTUNING2-NEXT: [[TMP40:%.*]] = lshr <vscale x 4 x i32> [[TMP38]], splat (i32 1) ; VSCALEFORTUNING2-NEXT: [[TMP41:%.*]] = zext <vscale x 4 x i32> [[TMP39]] to <vscale x 4 x i64> ; VSCALEFORTUNING2-NEXT: [[TMP42:%.*]] = zext <vscale x 4 x i32> [[TMP40]] to <vscale x 4 x i64> ; VSCALEFORTUNING2-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP41]] ; VSCALEFORTUNING2-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP42]] -; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP43]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) -; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP44]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP43]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP44]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; VSCALEFORTUNING2-NEXT: [[TMP45:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]] ; VSCALEFORTUNING2-NEXT: [[TMP46:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER8]], [[VEC_PHI5]] ; VSCALEFORTUNING2-NEXT: [[TMP47]] = or <vscale x 4 x i32> [[TMP45]], [[WIDE_MASKED_GATHER9]] @@ -225,11 +225,11 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; PRED-NEXT: [[TMP33:%.*]] = or <vscale x 4 x i32> [[TMP32]], splat (i32 2) ; PRED-NEXT: [[TMP34:%.*]] = or <vscale x 4 x i32> [[TMP15]], [[TMP33]] ; PRED-NEXT: [[TMP35:%.*]] = or <vscale x 4 x i32> [[TMP34]], [[BROADCAST_SPLAT]] -; PRED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; PRED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[DOTSPLAT]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; PRED-NEXT: [[TMP36:%.*]] = lshr <vscale x 4 x i32> [[TMP35]], splat (i32 1) ; PRED-NEXT: [[TMP37:%.*]] = zext <vscale x 4 x i32> [[TMP36]] to <vscale x 4 x i64> ; PRED-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP37]] -; PRED-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP38]], i32 4, <vscale 
x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; PRED-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP38]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; PRED-NEXT: [[TMP39:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]] ; PRED-NEXT: [[TMP40:%.*]] = or <vscale x 4 x i32> [[TMP39]], [[WIDE_MASKED_GATHER7]] ; PRED-NEXT: [[TMP41]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI]] @@ -451,7 +451,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP14]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison) +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr align 2 [[TMP14]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> poison) ; PRED-NEXT: [[TMP20:%.*]] = udiv <vscale x 8 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] ; PRED-NEXT: [[TMP21:%.*]] = or <vscale x 8 x i16> [[TMP20]], [[VEC_PHI]] ; PRED-NEXT: [[TMP16]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> [[TMP21]], <vscale x 8 x i16> [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index aff2c4c..7f34513 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -66,8 +66,9 @@ define void @replicating_load_used_as_store_addr_2(ptr noalias %invar.dst, ptr n ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -107,15 +108,15 @@ define void @replicating_load_used_as_store_addr_3(ptr noalias %src, ptr noalias ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] ; CHECK-NEXT: store i8 0, ptr [[TMP7]], align 1 -; CHECK-NEXT: store i8 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP5]] to i8 ; CHECK-NEXT: store i8 [[TMP8]], ptr [[INVAR_DST]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -208,25 +209,25 @@ define void @test_load_gep_widen_induction(ptr noalias %dst, ptr noalias %dst2) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) ; 
CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_2]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i128, ptr [[DST]], <2 x i64> [[STEP_ADD_3]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0 -; CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 ; CHECK-NEXT: store ptr null, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: store ptr null, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 ; CHECK-NEXT: store ptr null, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 0 ; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP2]], i32 1 ; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 ; CHECK-NEXT: store ptr null, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 ; CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8 +; CHECK-NEXT: store ptr null, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr ptr, ptr [[DST2]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr ptr, ptr [[TMP12]], i32 4 @@ -466,21 +467,21 @@ define void @test_prefer_vector_addressing(ptr %start, ptr %ms, ptr noalias %src ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP11]] ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]] ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA14:![0-9]+]] -; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA14]] -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA14]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[NEXT_GEP]], align 1, !tbaa [[LONG_LONG_TBAA12:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[NEXT_GEP3]], align 1, !tbaa [[LONG_LONG_TBAA12]] +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[NEXT_GEP4]], align 1, !tbaa [[LONG_LONG_TBAA12]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[NEXT_GEP5]], align 1, !tbaa [[LONG_LONG_TBAA12]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]] ; CHECK-NEXT: 
[[TMP20:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]] -; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA19:![0-9]+]] -; CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA19]] -; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA19]] -; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA19]] +; CHECK-NEXT: store i32 0, ptr [[TMP18]], align 4, !tbaa [[INT_TBAA17:![0-9]+]] +; CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4, !tbaa [[INT_TBAA17]] +; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4, !tbaa [[INT_TBAA17]] +; CHECK-NEXT: store i32 0, ptr [[TMP21]], align 4, !tbaa [[INT_TBAA17]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] @@ -581,10 +582,11 @@ define double @test_scalarization_cost_for_load_of_address(ptr %src.0, ptr %src. ; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP9]], [[TMP19]] ; CHECK-NEXT: [[TMP21]] = call double @llvm.vector.reduce.fadd.v2f64(double [[VEC_PHI]], <2 x double> [[TMP20]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret double [[TMP21]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll index 92b2a44..977713d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll @@ -22,7 +22,7 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP12]]) ; CHECK-NEXT: [[TMP14]] = fadd fast float [[VEC_PHI]], [[TMP13]] @@ -104,7 +104,7 @@ define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) { ; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP12:%.*]] = select fast <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 0x47EFFFFFE0000000) ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP12]]) ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index 5072058..d844634 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -134,7 +134,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP7]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP7]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP8]]) ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] @@ -350,10 +350,10 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP13]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP16]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP19]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale 
x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP22]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP13]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP16]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP19]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP22]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> [[WIDE_MASKED_LOAD]], <vscale x 8 x float> splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP23]]) ; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], <vscale x 8 x float> splat (float -0.000000e+00) @@ -594,7 +594,7 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[OFFSET_IDX]] ; CHECK-ORDERED-TF-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP10]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP10]], <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> [[WIDE_MASKED_VEC]]) ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0 ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1 @@ -811,9 +811,9 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> 
[[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP9]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = fadd <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[TMP10]], <vscale x 4 x float> splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]]) @@ -904,7 +904,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP4]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer ; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] -; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x float> poison) +; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x float> poison) ; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00) ; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]] ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -959,7 +959,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP4]], align 4 ; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer ; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] -; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x float> poison) +; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> [[TMP5]], <vscale x 4 x float> poison) ; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP5]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00) ; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]]) ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] @@ -1012,11 +1012,11 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; 
CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> zeroinitializer ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> [[WIDE_MASKED_LOAD1]], <vscale x 4 x float> splat (float 3.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[PREDPHI]], <vscale x 4 x float> splat (float -0.000000e+00) ; CHECK-ORDERED-TF-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP11]]) @@ -1419,10 +1419,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP13]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP16]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP19]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP22]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP13]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP16]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP19]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP22]], <vscale x 
8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = shl nuw i64 [[TMP24]], 3 @@ -1433,10 +1433,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP23]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP29]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP32]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP23]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP26]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP29]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP32]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] ; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] ; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = fmul <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] @@ -1715,10 +1715,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[TMP21]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP13]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP16]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP19]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call 
<vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP22]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP13]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP16]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP19]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP22]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDEX]] ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP25:%.*]] = shl nuw i64 [[TMP24]], 3 @@ -1729,10 +1729,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP31:%.*]] = mul nuw i64 [[TMP30]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP23]], i64 [[TMP31]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP23]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP26]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP29]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr [[TMP32]], i32 4, <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP23]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP26]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP29]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 8 x float> poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <vscale x 8 x float> @llvm.masked.load.nxv8f32.p0(ptr align 4 [[TMP32]], <vscale x 8 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 8 x float> poison) ; CHECK-ORDERED-TF-NEXT: [[TMP33:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] ; CHECK-ORDERED-TF-NEXT: [[TMP34:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] ; CHECK-ORDERED-TF-NEXT: [[TMP35:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD14]] diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 5f79d02..9a83169 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -24,14 +24,14 @@ define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP7]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP8:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP8]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP8]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP9]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP9]], ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP10]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP10]], ptr align 4 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -83,14 +83,14 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; CHECK-NEXT: [[TMP8:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @scalable_vec_masked_bar(<vscale x 2 x 
double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[TMP8]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP9]], ptr align 8 [[TMP11]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -160,14 +160,14 @@ define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, pt ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP16:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @scalable_vec_masked_foo(<vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP16]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP16]], 1 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP17]], ptr [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP17]], ptr align 4 [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) ; CHECK-NEXT: 
[[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index c775b44..c1f0a35 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -81,7 +81,7 @@ define void @cost_store_i8(ptr %dst) #0 { ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> zeroinitializer, ptr align 1 [[TMP13]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]]) ; PRED-NEXT: [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -209,7 +209,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-NEXT: [[TMP8:%.*]] = trunc <vscale x 2 x i64> [[BROADCAST_SPLAT3]] to <vscale x 2 x i8> ; PRED-NEXT: [[TMP9:%.*]] = and <vscale x 2 x i8> [[TMP8]], [[TMP11]] ; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP9]], ptr [[TMP5]], i32 1, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] +; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0(<vscale x 2 x i8> [[TMP9]], ptr align 1 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1000) ; PRED-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 495f9c0..3f32bbe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -22,10 +22,10 @@ define void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 2 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i16> poison) ; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32> ; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -80,9 +80,9 @@ define void @cond_inv_load_f64f64f64(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 4 x double> [[WIDE_LOAD]], splat (double 4.000000e-01) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> align 8 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x double> poison) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], i32 8, <vscale x 4 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP7]], <vscale x 4 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -138,11 +138,11 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP7]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP10]], <vscale x 4 x i1> [[TMP7]]) ; 
CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index d336f5f..d1b1771 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -20,7 +20,7 @@ define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], <vscale x 4 x i64> [[WIDE_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] ; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] @@ -74,7 +74,7 @@ define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocaptu ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], <vscale x 4 x i64> [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x ptr> [[TMP8]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x ptr> align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -123,7 +123,7 @@ define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocaptu ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> splat (i32 3), <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> splat (i32 3), <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -175,8 +175,8 @@ define void @gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias 
nocapture ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], splat (i32 3) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[TMP6]]) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], ptr align 4 [[TMP5]], <vscale x 4 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index 95836f8..3c04559 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -32,9 +32,9 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = trunc <vscale x 4 x i64> [[VEC_IND]] to <vscale x 4 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP10]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP11]], <vscale x 4 x i1> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index cde8976..786a2aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -115,10 +115,10 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x 
i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 2 [[TMP6]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison) ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 2 [[TMP8]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison) ; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]] ; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3 @@ -208,11 +208,11 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[TMP7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc <vscale x 4 x i32> [[TMP10]] to <vscale x 4 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr @CD_i16, <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[TMP11]], <vscale x 4 x ptr> [[TMP12]], i32 2, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[TMP11]], <vscale x 4 x ptr> align 2 [[TMP12]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 4 x i32> [[TMP13]] to <vscale x 4 x i16> ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr @CD_i16, <vscale x 4 x i64> [[TMP9]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[TMP14]], <vscale x 4 x ptr> [[TMP15]], i32 2, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> [[TMP14]], <vscale x 4 x ptr> align 2 [[TMP15]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -268,22 +268,22 @@ define i32 @test_struct_load6(ptr %S) #1 { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[S:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP5]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP5]], <vscale x 4 x i1> 
splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 4 -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP6]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[DOTSPLIT6:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT6]], i64 8 -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP7]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[DOTSPLIT7:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT7]], i64 12 -; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP8]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[DOTSPLIT8:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT8]], i64 16 -; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[DOTSPLIT9:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT9]], i64 20 -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]] ; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER3]] @@ -590,10 +590,10 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr 
inbounds [[PAIR:%.*]], ptr [[P1:%.*]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 8 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> align 8 [[TMP6]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison) ; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> [[TMP4]], <vscale x 4 x ptr> [[TMP5]], i32 8, <vscale x 4 x i1> splat (i1 true)) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> [[TMP7]], <vscale x 4 x ptr> [[TMP6]], i32 8, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> [[TMP4]], <vscale x 4 x ptr> align 8 [[TMP5]], <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0(<vscale x 4 x i64> [[TMP7]], <vscale x 4 x ptr> align 8 [[TMP6]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -728,8 +728,8 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) @@ -755,8 +755,8 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] - %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ] %a = getelementptr inbounds %struct.IntFloat, ptr %p, i64 %indvars.iv, i32 0 %load1 = load i32, ptr %a, align 4 %add = add nsw i32 %load1, %SumA.013 @@ -812,14 +812,14 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 4 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x ptr> [[TMP12]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -884,11 +884,11 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 4 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0 @@ -962,11 +962,11 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, 
ptr [[P]], i64 -8 ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, <vscale x 4 x ptr> [[DOTSPLIT]], i64 4 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> align 4 [[TMP14]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1038,7 +1038,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <vscale x 8 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC1]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0 @@ -1120,8 +1120,8 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[P:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]]) 
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] @@ -1202,9 +1202,9 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP13]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP14]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP16]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> align 4 [[TMP16]], <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x ptr> align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1289,10 +1289,10 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP18]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison), !alias.scope [[META34:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison), !alias.scope [[META34:![0-9]+]] ; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP19]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison), !alias.scope [[META34]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison), !alias.scope [[META34]] ; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1) ; CHECK-NEXT: [[TMP24:%.*]] = sext <vscale x 4 x i16> [[TMP23]] to <vscale x 4 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale 
x 4 x i16> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32> @@ -1373,26 +1373,26 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZ:%.*]], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_XYZ]], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP20]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_XYZ]], ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP19]], i64 4 -; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP21]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP20]], i64 4 -; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP22]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP16:%.*]] = sub nsw <vscale x 4 x i32> [[TMP8]], [[TMP12]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP10]], i64 4 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[TMP23]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP19]], i64 8 -; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> 
[[TMP15]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP20]], i64 8 -; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP24]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP24]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP17:%.*]] = shl <vscale x 4 x i32> [[TMP9]], [[TMP13]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i8, <vscale x 4 x ptr> [[TMP10]], i64 8 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> [[TMP25]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> align 4 [[TMP25]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 3b0bd87..02cc499 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -48,7 +48,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP7]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP7]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -58,7 +58,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP10]] ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP13]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> 
[[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -98,7 +98,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP12]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -108,7 +108,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP18]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -191,12 +191,12 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = zext nneg <vscale x 16 x i32> [[TMP4]] to <vscale x 16 x i64> ; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], 
<vscale x 16 x i64> [[TMP5]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP6]], i32 1, <vscale x 16 x i1> splat (i1 true)) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> align 1 [[TMP6]], <vscale x 16 x i1> splat (i1 true)) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP4]], splat (i32 1) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> ; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> align 1 [[TMP10]], <vscale x 16 x i1> [[TMP7]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -233,13 +233,13 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> align 1 [[TMP10]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64> ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP12]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], 
i64 0 @@ -321,12 +321,12 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = zext nneg <vscale x 16 x i32> [[TMP4]] to <vscale x 16 x i64> ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP6]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP7]], i32 1, <vscale x 16 x i1> [[TMP5]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> align 1 [[TMP7]], <vscale x 16 x i1> [[TMP5]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP4]], splat (i32 1) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64> ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP10]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP11]], i32 1, <vscale x 16 x i1> [[TMP8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> align 1 [[TMP11]], <vscale x 16 x i1> [[TMP8]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -368,13 +368,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64> ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP10]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64> ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> 
splat (i8 2), <vscale x 16 x ptr> align 1 [[TMP17]], <vscale x 16 x i1> [[TMP14]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -467,7 +467,7 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP7]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP7]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -481,7 +481,7 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -521,7 +521,7 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> 
[[TMP9]], <vscale x 16 x i1> [[TMP9]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP12]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP12]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -535,7 +535,7 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP19]], <vscale x 16 x i8> [[TMP20]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP22]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll index 4ae4935..9c3f3f7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll @@ -70,7 +70,7 @@ define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP9]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll index a1a89ee..76cf7d4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll @@ -7,9 +7,9 @@ define void @stride7_i32(ptr noalias nocapture %dst, i64 %n) #0 { ; CHECK: %[[VEC_IND:.*]] = phi <vscale x 4 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] ; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 4 x i64> %[[VEC_IND]], splat (i64 7) ; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, ptr %dst, <vscale x 4 x i64> %[[PTR_INDICES]] -; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> %[[PTRS]] +; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 %[[PTRS]] ; CHECK-NEXT: %[[VALS:.*]] = add nsw <vscale x 4 x i32> %[[GLOAD]], -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %[[VALS]], <vscale x 4 x ptr> %[[PTRS]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> %[[VALS]], <vscale x 4 x ptr> align 4 %[[PTRS]] entry: br label %for.body @@ -34,9 +34,9 @@ define void @stride7_f64(ptr noalias nocapture %dst, i64 %n) #0 { ; CHECK: %[[VEC_IND:.*]] = phi <vscale x 2 x i64> [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] ; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw <vscale x 2 x i64> %[[VEC_IND]], splat (i64 7) ; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, ptr %dst, <vscale x 2 x i64> %[[PTR_INDICES]] -; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> %[[PTRS]], +; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 %[[PTRS]], ; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]], -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> %[[VALS]], <vscale x 2 x ptr> %[[PTRS]], +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> %[[VALS]], <vscale x 2 x ptr> align 8 %[[PTRS]], entry: br label %for.body @@ -61,9 +61,9 @@ define void @cond_stride7_f64(ptr noalias nocapture %dst, ptr noalias nocapture ; CHECK: vector.body ; CHECK: %[[MASK:.*]] = icmp ne <vscale x 2 x i64> ; CHECK: %[[PTRS:.*]] = getelementptr inbounds double, ptr %dst, <vscale x 2 x i64> %{{.*}} -; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]] +; CHECK-NEXT: %[[GLOAD:.*]] = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 %[[PTRS]], <vscale x 2 x i1> %[[MASK]] ; CHECK-NEXT: %[[VALS:.*]] = fadd <vscale x 2 x double> %[[GLOAD]], -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> %[[VALS]], <vscale x 2 x ptr> %[[PTRS]], i32 8, <vscale x 2 x i1> %[[MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> %[[VALS]], <vscale x 2 x ptr> align 8 %[[PTRS]], <vscale x 2 x i1> %[[MASK]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll index f03f743..8c62ffd9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -17,12 +17,12 @@ define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 1) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP6]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7) ; CHECK-NEXT: [[EXTRACT_FIRST_LANE_MASK:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll index 820fd88..54c0dfd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll @@ -7,9 +7,9 @@ define void @mloadstore_f32(ptr noalias nocapture %a, ptr noalias nocapture read ; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, ptr ; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]], ; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, ptr %a, -; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]] +; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 %[[GEPA]], <vscale x 4 x i1> %[[MASK]] ; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %[[FADD]], ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %[[FADD]], ptr align 4 %[[GEPA]], <vscale x 4 x i1> %[[MASK]]) entry: br label %for.body @@ -42,9 +42,9 @@ define void @mloadstore_i32(ptr noalias nocapture %a, ptr noalias nocapture read ; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, ptr ; CHECK-NEXT: %[[MASK:.*]] = 
icmp ne <vscale x 4 x i32> %[[LOAD1]], ; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, ptr %a, -; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]] +; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 %[[GEPA]], <vscale x 4 x i1> %[[MASK]] ; CHECK-NEXT: %[[FADD:.*]] = add <vscale x 4 x i32> %[[LOAD1]], %[[LOAD2]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[FADD]], ptr %[[GEPA]], i32 4, <vscale x 4 x i1> %[[MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[FADD]], ptr align 4 %[[GEPA]], <vscale x 4 x i1> %[[MASK]]) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll index 1018bdd..c348dec 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -156,7 +156,7 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1 ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32> ; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], splat (i32 35) -; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison) +; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 {{%.*}}, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison) ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], splat (i32 2) ; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[VEC_ICMP]] ; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i1> [[VEC_SEL_TMP]], <vscale x 4 x i1> [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index b8b4fbd..8108320 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -61,7 +61,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll index cb2c003..243ea7c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -16,12 +16,12 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP8]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; CHECK-NEXT: [[TMP10:%.*]] = shl nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD]], splat (i64 1) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP11]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison) ; CHECK-NEXT: [[TMP13:%.*]] = add nsw <vscale x 2 x i64> [[WIDE_MASKED_LOAD1]], [[TMP10]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr [[TMP11]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP13]], ptr align 8 [[TMP11]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index 33ee0d6..3b8625e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -23,10 +23,10 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP0]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP0]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <vscale x 4 x i32> 
[[WIDE_MASKED_LOAD]], splat (i32 42) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP1]], ptr [[TMP2]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP1]], ptr align 4 [[TMP2]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -83,10 +83,10 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP3]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP3]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 42) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index b5544dc..ae7c9d2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -26,7 +26,7 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: 
[[TMP14]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] @@ -59,7 +59,7 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]]) ; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[VEC_PHI]], [[TMP14]] @@ -110,7 +110,7 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00) ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] @@ -142,7 +142,7 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float -0.000000e+00) ; CHECK-IN-LOOP-NEXT: [[TMP14]] = 
call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]]) ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] @@ -191,11 +191,11 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 7, i32 0), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 5) ; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP20]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[VEC_PHI]] @@ -228,11 +228,11 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-IN-LOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]] -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 5) ; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison) +; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i32> 
@llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison) ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i32> zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP17]]) ; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[VEC_PHI]], [[TMP18]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 5531b3c..c8ecb7f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -49,10 +49,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP59:%.*]] = mul nuw i64 [[TMP58]], 12 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP47]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP47]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP54]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP57]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP60]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]]) ; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] ; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP64:%.*]] = shl nuw i64 [[TMP63]], 2 @@ -135,10 +135,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP59:%.*]] = mul nuw i64 [[TMP58]], 12 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP47]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison) +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP47]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP54]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP57]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP60]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK9]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD10]], zeroinitializer ; CHECK-NEXT: [[TMP63:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD11]], zeroinitializer @@ -157,10 +157,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP81:%.*]] = mul nuw i64 [[TMP80]], 12 ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP65]], i32 4, <vscale x 4 x i1> [[TMP69]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, <vscale x 4 x i1> [[TMP70]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, <vscale x 4 x i1> [[TMP71]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, <vscale x 4 x i1> [[TMP72]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP65]], <vscale x 4 x i1> [[TMP69]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP76]], <vscale x 4 x i1> [[TMP70]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP79]], <vscale x 4 x i1> [[TMP71]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP82]], <vscale x 4 x i1> [[TMP72]]) ; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP6]] ; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP86:%.*]] = shl nuw i64 [[TMP85]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 9ebe790..945d808 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -25,7 +25,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) 
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -69,7 +69,7 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP4]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -115,9 +115,9 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP13]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -175,9 +175,9 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> 
[[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> align 4 [[TMP20]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -226,11 +226,11 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> align 4 [[TMP14]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -284,7 +284,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 
[[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -337,13 +337,13 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr align 4 [[TMP16]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -404,8 +404,8 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) -; 
CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> align 4 [[BROADCAST_SPLAT]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -454,10 +454,10 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr align 4 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -509,11 +509,11 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
<vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> splat (i32 1) ; CHECK-NEXT: [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr align 4 [[TMP12]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll index 9485d82..c3f7a25 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll @@ -19,12 +19,12 @@ target triple = "aarch64-unknown-linux-gnu" define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 { ; CHECK-LABEL: vector.body: ; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}}) -; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison) +; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0(ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison) ; CHECK: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]]) ; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]] ; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}}) ; CHECK: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]]) -; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]] +; CHECK: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> %[[REVERSE8]], ptr align 8 %{{.*}}, <vscale x 4 x i1> %[[REVERSE9]] entry: %cmp7 = icmp sgt i64 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll index 7628b39..8b6c7fe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll @@ -14,12 +14,12 @@ define void @test_big_little_params(ptr readonly %a, ptr readonly %b, ptr noalia ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ splat (i1 true), [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP2]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP2]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i8> [[WIDE_MASKED_LOAD1]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -58,12 +58,12 @@ define void @test_little_big_params(ptr readonly %a, ptr readonly %b, ptr noalia ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ splat (i1 true), [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr [[TMP2]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x float> @llvm.masked.load.nxv2f32.p0(ptr align 4 [[TMP2]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x float> poison) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP3]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x double> @bar_vector(<vscale x 2 x float> [[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD1]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP4]], ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP4]], ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: 
[[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll index 75acbea9..f2e3b70 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -23,10 +23,10 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, ; CHECK-UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH1]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] -; CHECK-UF1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-UF1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP10]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-UF1-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) ; CHECK-UF1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] -; CHECK-UF1-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr [[TMP13]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF1-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP6]], ptr align 1 [[TMP13]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-UF1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] ; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-UF1-NEXT: [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -68,10 +68,10 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, ; CHECK-UF4-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UF4-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP34]], 48 ; CHECK-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 16 x i8> poison) -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP33]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 16 x i8> poison) -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP30]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 16 x i8> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP20]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP24]], <vscale 
x 16 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 16 x i8> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP33]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 16 x i8> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP30]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 16 x i8> poison) ; CHECK-UF4-NEXT: [[TMP25:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) ; CHECK-UF4-NEXT: [[TMP26:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD9]], splat (i8 3) ; CHECK-UF4-NEXT: [[TMP27:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD10]], splat (i8 3) @@ -86,10 +86,10 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, ; CHECK-UF4-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UF4-NEXT: [[TMP44:%.*]] = mul nuw i64 [[TMP43]], 48 ; CHECK-UF4-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP44]] -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP25]], ptr [[TMP35]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr [[TMP39]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP27]], ptr [[TMP42]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP28]], ptr [[TMP45]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP25]], ptr align 1 [[TMP35]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr align 1 [[TMP39]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP27]], ptr align 1 [[TMP42]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK7]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP28]], ptr align 1 [[TMP45]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK8]]) ; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP62]] ; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-UF4-NEXT: [[TMP58]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 48) @@ -141,10 +141,10 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl ; CHECK-UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-UF1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] -; CHECK-UF1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-UF1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; CHECK-UF1-NEXT: [[TMP3:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat 
(double 3.000000e+00) ; CHECK-UF1-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; CHECK-UF1-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr [[TMP8]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF1-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP3]], ptr align 8 [[TMP8]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-UF1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP13]]) ; CHECK-UF1-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -189,10 +189,10 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl ; CHECK-UF4-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UF4-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 6 ; CHECK-UF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP24]] -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP29]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 2 x double> poison) -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP22]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 2 x double> poison) -; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP25]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP29]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]], <vscale x 2 x double> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP22]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]], <vscale x 2 x double> poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP25]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]], <vscale x 2 x double> poison) ; CHECK-UF4-NEXT: [[TMP16:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) ; CHECK-UF4-NEXT: [[TMP17:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD9]], splat (double 3.000000e+00) ; CHECK-UF4-NEXT: [[TMP18:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD10]], splat (double 3.000000e+00) @@ -207,10 +207,10 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl ; CHECK-UF4-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UF4-NEXT: [[TMP39:%.*]] = mul nuw i64 [[TMP38]], 6 ; CHECK-UF4-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP39]] -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP16]], ptr [[TMP30]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP17]], ptr [[TMP34]], i32 8, <vscale x 2 x i1> 
[[ACTIVE_LANE_MASK6]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP18]], ptr [[TMP37]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]]) -; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP19]], ptr [[TMP40]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP16]], ptr align 8 [[TMP30]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP17]], ptr align 8 [[TMP34]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP18]], ptr align 8 [[TMP37]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK7]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP19]], ptr align 8 [[TMP40]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK8]]) ; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]] ; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-UF4-NEXT: [[TMP53]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv8i1(<vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 6) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index ef111ca..f223786 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -35,11 +35,11 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, <vscale x 2 x ptr> [[VECTOR_GEP]], i64 1 ; CHECK-NEXT: store <vscale x 2 x ptr> [[TMP9]], ptr [[NEXT_GEP]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP10]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1) ; CHECK-NEXT: store <vscale x 2 x i8> [[TMP12]], ptr [[TMP10]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index cf41664..3b2b0b5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -239,9 +239,9 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 2 x i64> [[TMP9]], splat (i64 2) ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: store <vscale x 2 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8 @@ -313,9 +313,9 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr % ; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[TMP4]], splat (i64 1) ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 2 x ptr> [[VECTOR_GEP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]]) +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 2 x ptr> [[VECTOR_GEP]], zeroinitializer +; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr align 2 [[TMP7]], <vscale x 2 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 742097b..871d9be 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -552,7 +552,7 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison) ; CHECK-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64> ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll index 753847f..a14ea74 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -26,7 +26,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer ; 
CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDX]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr align 4 [[TMP6]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[IDX]], 4 ; CHECK-NEXT: [[NEXT_ACTIVE_LANE_MASK]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[IDX]], i64 [[N2]]) ; CHECK-NEXT: [[EXTRACT_FIRST_LANE_MASK:%.*]] = extractelement <4 x i1> [[NEXT_ACTIVE_LANE_MASK]], i32 0 @@ -75,13 +75,13 @@ define void @cond_uniform_load(ptr noalias nocapture %dst, ptr nocapture readonl ; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX6]] -; CHECK-NEXT: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP6]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[COND_LOAD]], zeroinitializer ; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[SRC_SPLAT]], <4 x i1> [[MASK]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX6]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[PREDPHI]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[PREDPHI]], ptr align 4 [[TMP7]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX6]], 4 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX6]], i64 [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index 1607755..de70da6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -66,7 +66,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; DATA-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[UMAX]]) ; DATA-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], 
ptr align 4 [[TMP10]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP5]] ; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] ; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -101,7 +101,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_NO_LANEMASK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT3]], [[TMP11]] ; DATA_NO_LANEMASK-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[TMP12]]) +; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT5]], ptr align 4 [[TMP13]], <vscale x 4 x i1> [[TMP12]]) ; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP5]] ; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] ; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -125,7 +125,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_AND_CONTROL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP10]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP5]] ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]]) ; DATA_AND_CONTROL-NEXT: [[TMP6:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 @@ -156,7 +156,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX1]] -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) +; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr align 4 [[TMP11]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]]) ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP1]] ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; 
DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll index dd63b5e..6d0c55b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops.ll @@ -748,15 +748,15 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) { ; VF2-NEXT: [[TMP22:%.*]] = shufflevector <6 x i32> [[WIDE_VEC1]], <6 x i32> poison, <2 x i32> <i32 1, i32 4> ; VF2-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP7]], [[TMP13]] ; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0 -; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1 +; VF2-NEXT: store i32 [[TMP15]], ptr [[TMP8]], align 8 ; VF2-NEXT: store i32 [[TMP16]], ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 ; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 ; VF2-NEXT: [[TMP23:%.*]] = mul <2 x i32> [[TMP7]], [[TMP22]] ; VF2-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0 -; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[TMP25:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1 +; VF2-NEXT: store i32 [[TMP24]], ptr [[TMP17]], align 8 ; VF2-NEXT: store i32 [[TMP25]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 98 @@ -789,12 +789,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) { ; VF4-NEXT: [[TMP44:%.*]] = shufflevector <12 x i32> [[WIDE_VEC1]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; VF4-NEXT: [[TMP28:%.*]] = mul <4 x i32> [[TMP15]], [[TMP27]] ; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i32 0 -; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8 ; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i32 1 -; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8 ; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i32 2 -; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8 ; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i32 3 +; VF4-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 8 +; VF4-NEXT: store i32 [[TMP30]], ptr [[TMP17]], align 8 +; VF4-NEXT: store i32 [[TMP31]], ptr [[TMP18]], align 8 ; VF4-NEXT: store i32 [[TMP32]], ptr [[TMP19]], align 8 ; VF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP0]], i32 1 ; VF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP1]], i32 1 @@ -802,12 +802,12 @@ define void @test_2xi32(ptr noalias %data, ptr noalias %factor) { ; VF4-NEXT: [[TMP36:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[TMP3]], i32 1 ; VF4-NEXT: [[TMP45:%.*]] = mul <4 x i32> [[TMP15]], [[TMP44]] ; VF4-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0 -; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8 ; VF4-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1 -; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8 ; VF4-NEXT: [[TMP48:%.*]] = 
extractelement <4 x i32> [[TMP45]], i32 2 -; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8 ; VF4-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3 +; VF4-NEXT: store i32 [[TMP46]], ptr [[TMP33]], align 8 +; VF4-NEXT: store i32 [[TMP47]], ptr [[TMP34]], align 8 +; VF4-NEXT: store i32 [[TMP48]], ptr [[TMP35]], align 8 ; VF4-NEXT: store i32 [[TMP49]], ptr [[TMP36]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll index 187edb5..4761cb0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll @@ -20,30 +20,30 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP4]], align 2 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i16> [[WIDE_LOAD]], splat (i16 10) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = ashr exact i64 [[TMP10]], 32 -; CHECK-NEXT: [[TMP15:%.*]] = ashr exact i64 [[TMP11]], 32 -; CHECK-NEXT: [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32 -; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 -; CHECK-NEXT: store i16 [[TMP22]], ptr [[TMP18]], align 2 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 -; CHECK-NEXT: store i16 [[TMP23]], ptr [[TMP19]], align 2 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 -; CHECK-NEXT: store i16 [[TMP24]], ptr [[TMP20]], align 2 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 -; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: 
[[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32 +; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32 +; CHECK-NEXT: [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]] +; CHECK-NEXT: store i16 [[TMP6]], ptr [[TMP22]], align 2 +; CHECK-NEXT: store i16 [[TMP7]], ptr [[TMP23]], align 2 +; CHECK-NEXT: store i16 [[TMP8]], ptr [[TMP24]], align 2 +; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP25]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -95,30 +95,30 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT]] to <4 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i16> [[WIDE_LOAD]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = ashr exact i64 [[TMP12]], 32 -; CHECK-NEXT: [[TMP17:%.*]] = ashr exact i64 [[TMP13]], 32 -; CHECK-NEXT: [[TMP18:%.*]] = ashr exact i64 [[TMP14]], 32 -; CHECK-NEXT: [[TMP19:%.*]] = ashr exact i64 [[TMP15]], 32 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP7]], i32 0 -; CHECK-NEXT: store i16 [[TMP24]], ptr [[TMP20]], align 2 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP7]], i32 1 -; CHECK-NEXT: store i16 [[TMP25]], ptr [[TMP21]], align 2 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP7]], i32 2 -; CHECK-NEXT: store i16 [[TMP26]], ptr [[TMP22]], align 2 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3 -; CHECK-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i16> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr 
inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = ashr exact i64 [[TMP16]], 32 +; CHECK-NEXT: [[TMP21:%.*]] = ashr exact i64 [[TMP17]], 32 +; CHECK-NEXT: [[TMP22:%.*]] = ashr exact i64 [[TMP18]], 32 +; CHECK-NEXT: [[TMP23:%.*]] = ashr exact i64 [[TMP19]], 32 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[TMP23]] +; CHECK-NEXT: store i16 [[TMP8]], ptr [[TMP24]], align 2 +; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP25]], align 2 +; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP26]], align 2 +; CHECK-NEXT: store i16 [[TMP11]], ptr [[TMP27]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index e9de5e2..dcb8906 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -21,10 +21,10 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x 
i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -53,16 +53,16 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP8]], 4 ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[DOTIDX]] -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison) ; INTERLEAVE-NEXT: [[TMP10:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x double> @foo_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[DOTIDX5:%.*]] = shl i64 [[TMP13]], 4 ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]] -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr align 8 [[TMP14]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1 @@ -107,10 +107,10 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP5]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) ; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x 
i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP6]], ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -139,16 +139,16 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[DOTIDX:%.*]] = shl i64 [[TMP8]], 4 ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[DOTIDX]] -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) -; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]], <vscale x 2 x double> poison) ; INTERLEAVE-NEXT: [[TMP10:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) ; INTERLEAVE-NEXT: [[TMP11:%.*]] = call <vscale x 2 x double> @bar_uniform(<vscale x 2 x double> [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[DOTIDX5:%.*]] = shl i64 [[TMP13]], 4 ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[DOTIDX5]] -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr [[TMP12]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr [[TMP14]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP10]], ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP11]], ptr align 8 [[TMP14]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK2]]) ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; INTERLEAVE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 1 @@ -188,7 +188,7 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]] ; 
CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds nuw double, ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -212,7 +212,7 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; INTERLEAVE: pred.store.if: ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 8 -; INTERLEAVE-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]] ; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] ; INTERLEAVE-NEXT: store double [[TMP3]], ptr [[TMP4]], align 8 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]] @@ -222,7 +222,7 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; INTERLEAVE-NEXT: [[TMP5:%.*]] = or disjoint i64 [[INDEX]], 1 ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP6]], align 8 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = call double @foo(double [[TMP7]], i64 [[TMP5]]) #[[ATTR5]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = call double @foo(double [[TMP7]], i64 [[TMP5]]) #[[ATTR4]] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP5]] ; INTERLEAVE-NEXT: store double [[TMP8]], ptr [[TMP9]], align 8 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index 1dd49ec..17be6cc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -46,13 +46,13 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]], <4 x double> poison) ; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE5]], <4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP9]], <4 x i1> [[REVERSE5]], <4 x double> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], splat (double 1.000000e+00) ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], splat (double 1.000000e+00) -; CHECK-NEXT: call void 
@llvm.masked.store.v4f64.p0(<4 x double> [[TMP10]], ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]]) -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE5]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP10]], ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr align 8 [[TMP9]], <4 x i1> [[REVERSE5]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index 96a25a8..5999707 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -116,11 +116,11 @@ define void @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double> ; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @__simd_sin_v2f64(<2 x double> [[TMP4]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 ; CHECK-NEXT: store double [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 ; CHECK-NEXT: store double [[TMP9]], ptr [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index 2adb3b5..fc0b19d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -54,8 +54,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 { ; NARROW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 ; NARROW-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float> ; NARROW-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]] ; NARROW-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]] ; NARROW-NEXT: [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]] ; NARROW-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 ; NARROW-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll index 58a24ee..583e156 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll @@ -23,14 +23,14 @@ define 
void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP3]], <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) ; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 -; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr [[TMP6]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr align 1 [[TMP6]], <16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr align 1 [[TMP8]], <16 x i1> [[ACTIVE_LANE_MASK1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll index 44a48a9..0f398a6 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll @@ -84,15 +84,14 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK: We can vectorize this loop! 
define i32 @redi(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader + br label %for.body.preheader for.body.preheader: ; preds = %entry br label %for.body for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07 %0 = load i32, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07 @@ -107,9 +106,8 @@ for.end.loopexit: ; preds = %for.body %add.lcssa = phi i32 [ %add, %for.body ] br label %for.end -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret i32 %Red.0.lcssa +for.end: ; preds = %for.end.loopexit + ret i32 %add.lcssa } ; Floating-point loops need fast-math to be vectorizeable @@ -121,15 +119,14 @@ for.end: ; preds = %for.end.loopexit, % ; DARWIN: We can vectorize this loop! define float @redf(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader + br label %for.body.preheader for.body.preheader: ; preds = %entry br label %for.body for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ] %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07 %0 = load float, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07 @@ -144,9 +141,8 @@ for.end.loopexit: ; preds = %for.body %add.lcssa = phi float [ %add, %for.body ] br label %for.end -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret float %Red.0.lcssa +for.end: ; preds = %for.end.loopexit + ret float %add.lcssa } ; Make sure calls that turn into builtins are also covered @@ -252,7 +248,7 @@ for.body.preheader: ; preds = %entry for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07 %0 = load i32, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07 @@ -268,7 +264,7 @@ for.end.loopexit: ; preds = %for.body br label %for.end for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] + %Red.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] ret i32 %Red.0.lcssa } @@ -277,15 +273,14 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK: We can vectorize this loop! 
define float @redf_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) { entry: - %cmp5 = icmp eq i32 %N, 0 - br i1 %cmp5, label %for.end, label %for.body.preheader + br label %for.body.preheader for.body.preheader: ; preds = %entry br label %for.body for.body: ; preds = %for.body.preheader, %for.body %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ] + %Red.06 = phi float [ %add, %for.body ], [ 0.0e+00, %for.body.preheader ] %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07 %0 = load float, ptr %arrayidx, align 4 %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07 @@ -300,9 +295,8 @@ for.end.loopexit: ; preds = %for.body %add.lcssa = phi float [ %add, %for.body ] br label %for.end -for.end: ; preds = %for.end.loopexit, %entry - %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ] - ret float %Red.0.lcssa +for.end: ; preds = %for.end.loopexit + ret float %add.lcssa } ; Make sure calls that turn into builtins are also covered diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll index 9a76019..0b0e2d4 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll @@ -17,10 +17,10 @@ define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -189,10 +189,10 @@ define void @test_stride3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], splat (i32 3) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], splat (i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP2]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -236,10 +236,10 @@ define void @test_stride4_4i32(ptr readonly %data, ptr noalias nocapture %dst, i ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], splat (i32 2) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP2]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP3]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -284,10 +284,10 @@ define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocaptur ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <4 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -346,7 +346,7 @@ define void @test_stride_noninvar_4i32(ptr readonly %data, ptr noalias nocapture ; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]] ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i32> [[TMP2]], splat (i32 2) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP3]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP4]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 @@ -465,7 +465,7 @@ define void @test_stride_noninvar3_4i32(ptr readonly %data, ptr noalias nocaptur ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]] ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], splat (i32 2) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], <4 x i32> [[TMP5]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP6]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP6]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> splat (i32 5), [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP8]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll index be4a6be..2686f41 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll @@ -70,9 +70,9 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP7]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP11]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP11]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP12]], <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll index fe3504b..23609b1 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll @@ -249,7 +249,7 @@ define dso_local i32 
@predicated_test(i32 noundef %0, ptr %glob) #0 { br label %7 7: ; preds = %5, %155 - %8 = phi i32 [ %10, %155 ], [ undef, %5 ] + %8 = phi i32 [ %10, %155 ], [ 0, %5 ] %9 = phi i32 [ %156, %155 ], [ 0, %5 ] %10 = shl i32 %8, 4 store i32 %10, ptr %6, align 4 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll index 0a4ed7f..706012c 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll @@ -15,7 +15,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]]) ; CHECK-NEXT: [[TMP2]] = add i32 [[VEC_PHI]], [[TMP1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -56,9 +56,9 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[VEC_IND]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[VEC_PHI]], [[TMP3]] @@ -108,9 +108,9 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr 
align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] @@ -155,9 +155,9 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> splat (i32 -1) ; CHECK-NEXT: [[TMP4]] = and <4 x i32> [[VEC_PHI]], [[TMP3]] @@ -202,9 +202,9 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4]] = or <4 x i32> [[VEC_PHI]], [[TMP3]] @@ -249,9 +249,9 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: 
[[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4]] = xor <4 x i32> [[VEC_PHI]], [[TMP3]] @@ -296,9 +296,9 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]] @@ -343,9 +343,9 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], 
i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index 029d8bd..3fb645c 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -21,9 +21,9 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]] @@ -82,9 +82,9 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP4]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = 
mul nsw <16 x i32> [[TMP6]], [[TMP3]] @@ -143,7 +143,7 @@ define i32 @add_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]] @@ -194,7 +194,7 @@ define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -245,7 +245,7 @@ define i32 @and_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -296,7 +296,7 @@ define i32 @or_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> 
[[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -347,7 +347,7 @@ define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -398,7 +398,7 @@ define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -449,7 +449,7 @@ define float @fmul_f32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float 1.000000e+00), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison) ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index 1540baa..d9c4414 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -252,7 +252,7 @@ define i32 @add_i32_i32(ptr nocapture readonly %x, i32 %n) 
#0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP0]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]]) ; CHECK-NEXT: [[TMP2]] = add i32 [[VEC_PHI]], [[TMP1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -296,7 +296,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) @@ -343,7 +343,7 @@ define i32 @add_i8_i32(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]]) @@ -390,7 +390,7 @@ define signext i16 @add_i16_i16(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP1:%.*]] = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]]) ; CHECK-NEXT: [[TMP2]] = add i16 [[VEC_PHI]], [[TMP1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -434,7 +434,7 @@ define signext i16 @add_i8_i16(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP1]], <16 x i16> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP2]]) @@ -481,7 +481,7 @@ define zeroext i8 @add_i8_i8(ptr nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]]) ; CHECK-NEXT: [[TMP2]] = add i8 [[VEC_PHI]], [[TMP1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 @@ -801,9 +801,9 @@ define i32 @mla_i32_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Y1:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP7]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) ; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) @@ -852,9 +852,9 @@ define i32 @mla_i16_i32(ptr nocapture 
readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP2]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] @@ -907,9 +907,9 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP2]], [[TMP3]] @@ -962,9 +962,9 @@ define signext i16 @mla_i16_i16(ptr nocapture readonly %x, ptr nocapture readonl ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP1]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[Y1:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP7]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x 
i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP7]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) @@ -1013,9 +1013,9 @@ define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16> ; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]] @@ -1068,9 +1068,9 @@ define zeroext i8 @mla_i8_i8(ptr nocapture readonly %x, ptr nocapture readonly % ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Y1:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP7]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[TMP2]], <16 x i8> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) @@ -1119,9 +1119,9 @@ define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalia ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr align 1 [[TMP0]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[TMP2]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] @@ -1267,10 +1267,10 @@ define i32 @red_mla_u8_s8_u32(ptr noalias nocapture readonly %A, ptr noalias noc ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[B1:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr [[TMP9]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr align 1 [[TMP9]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD2]] to <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer @@ -1407,7 +1407,7 @@ define i32 @mla_i8_i32_multiuse(ptr nocapture readonly %x, ptr nocapture readonl ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP0]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll index 
d930a96..a976016 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll @@ -35,10 +35,10 @@ define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 { ; CHECK-NEXT: [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI2]], <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[VECTOR_GEP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 [[VECTOR_GEP4]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 [[TMP1]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 [[TMP2]], <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]] ; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER]] to <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 19595) ; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER5]] to <4 x i32> @@ -67,10 +67,10 @@ define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 { ; CHECK-NEXT: [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], splat (i32 16) ; CHECK-NEXT: [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i8> ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP13]], <4 x ptr> [[VECTOR_GEP]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP13]], <4 x ptr> align 1 [[VECTOR_GEP]], <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP21]], <4 x ptr> [[TMP30]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP29]], <4 x ptr> [[TMP31]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP21]], <4 x ptr> align 1 [[TMP30]], <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP29]], <4 x ptr> align 1 [[TMP31]], <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias 
[[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND3]] = getelementptr i8, ptr [[POINTER_PHI2]], i32 12 ; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll index 04a97f4..b49377c 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll @@ -7,10 +7,6 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK-COST-LABEL: arm_offset_q15 -; CHECK-COST: LV: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Cost of 36 for VF 2: REPLICATE ir<%1> = call @llvm.sadd.sat.i16(ir<%0>, ir<%offset>) -; CHECK-COST: Cost of 8 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>) -; CHECK-COST: Cost of 2 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>) define void @arm_offset_q15(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @arm_offset_q15( @@ -30,9 +26,9 @@ define void @arm_offset_q15(ptr nocapture readonly %pSrc, i16 signext %offset, p ; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = shl i32 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[OFFSET_IDX5]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[BLOCKSIZE]]) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[NEXT_GEP]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr align 2 [[NEXT_GEP]], <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> [[BROADCAST_SPLAT8]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[TMP0]], ptr [[NEXT_GEP6]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0(<8 x i16> [[TMP0]], ptr align 2 [[NEXT_GEP6]], <8 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP1]], label [[WHILE_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll index e881125..df648c9 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -111,7 +111,7 @@ define hidden void @pointer_phi_v4i32_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 12, i32 24, i32 36> ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x 
i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -544,7 +544,7 @@ define hidden void @pointer_phi_v4f32_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 12, i32 24, i32 36> ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x float> poison) ; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -749,8 +749,8 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, <4 x ptr> [[TMP0]], i32 96 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 @@ -816,10 +816,10 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP0]], i32 288 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), 
<4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]] @@ -886,18 +886,18 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias n ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI5]], <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> splat (i1 true), <4 x i8> poison), !alias.scope [[META28:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 [[TMP0]], <4 x i1> splat (i1 true), <4 x i8> poison), !alias.scope [[META28:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> splat (i1 true), <4 x i8> poison), !alias.scope [[META28]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> splat (i1 true), <4 x i8> poison), !alias.scope [[META28]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 [[TMP2]], <4 x i1> splat (i1 true), <4 x i8> poison), !alias.scope [[META28]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> align 1 [[TMP3]], <4 x i1> splat (i1 true), <4 x i8> poison), !alias.scope [[META28]] ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], splat (i8 10) ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]] ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> splat (i1 true)), !alias.scope [[META31:![0-9]+]], !noalias [[META28]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> align 1 [[TMP1]], <4 x i1> splat (i1 true)), !alias.scope [[META31:![0-9]+]], !noalias [[META28]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> splat (i1 true)), !alias.scope [[META31]], !noalias [[META28]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> splat (i1 true)), !alias.scope [[META31]], !noalias [[META28]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> align 1 [[TMP7]], <4 x i1> splat (i1 true)), !alias.scope [[META31]], !noalias [[META28]] +; CHECK-NEXT: call 
void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> align 1 [[TMP8]], <4 x i1> splat (i1 true)), !alias.scope [[META31]], !noalias [[META28]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12 ; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, ptr [[POINTER_PHI5]], i32 12 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll index 0f52456..27946ef 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -19,9 +19,9 @@ define dso_local void @sgt_loopguard(ptr noalias nocapture readonly %a, ptr noal ; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0 ; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %N) -; CHECK-TF: llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask -; CHECK-TF: llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask -; CHECK-TF: llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask) +; CHECK-TF: llvm.masked.load.v16i8.p0(ptr align 1 %{{.*}}, <16 x i1> %active.lane.mask +; CHECK-TF: llvm.masked.load.v16i8.p0(ptr align 1 %{{.*}}, <16 x i1> %active.lane.mask +; CHECK-TF: llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr align 1 %{{.*}}, <16 x i1> %active.lane.mask) entry: %cmp5 = icmp sgt i32 %N, 0 br i1 %cmp5, label %while.body.preheader, label %while.end diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll index 956b575..4af40b7 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -504,7 +504,7 @@ define dso_local void @select_not_allowed(ptr noalias nocapture %A, ptr noalias ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, <4 x ptr> [[TMP4]], <4 x i32> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP5]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: store <4 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 10f8f74..f25b86d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -102,7 +102,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP15:%.*]] = sext <vscale x 4 x i32> [[VEC_IND]] to <vscale x 4 x i64> ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, 
ptr [[DST]], <vscale x 4 x i64> [[TMP15]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> [[TMP16]], i32 1, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0(<vscale x 4 x i8> zeroinitializer, <vscale x 4 x ptr> align 1 [[TMP16]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -224,7 +224,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 ; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] ; CHECK: [[PRED_STORE_CONTINUE18]]: -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META13]] +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr align 4 [[TMP11]], <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META13]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index ef0f0cf..5c78cfd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -32,7 +32,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP7]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP7]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) ; SCALAR_EPILOGUE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]]) ; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -42,7 +42,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP10]] ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP13]]) ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x 
i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -79,7 +79,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] ; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP7]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP7]], <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison) ; PREDICATED_DATA-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]]) ; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -89,7 +89,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP10]] ; PREDICATED_DATA-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP10]], <vscale x 16 x i8> [[TMP13]]) ; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; PREDICATED_DATA-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) +; PREDICATED_DATA-NEXT: call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], <vscale x 32 x i1> [[INTERLEAVED_MASK3]]) ; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP1]] ; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_DATA-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -206,7 +206,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP7]], i32 1, <vscale x 64 x 
i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP7]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) ; SCALAR_EPILOGUE-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]]) ; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -220,7 +220,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]]) ; SCALAR_EPILOGUE-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) ; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -257,7 +257,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; PREDICATED_DATA-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] ; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; PREDICATED_DATA-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP7]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) +; PREDICATED_DATA-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP7]], <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison) ; PREDICATED_DATA-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]]) ; PREDICATED_DATA-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0 ; PREDICATED_DATA-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1 @@ -271,7 +271,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] ; 
PREDICATED_DATA-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP12]], <vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]]) ; PREDICATED_DATA-NEXT: [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1> [[TMP4]]) -; PREDICATED_DATA-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) +; PREDICATED_DATA-NEXT: call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], <vscale x 64 x i1> [[INTERLEAVED_MASK3]]) ; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP1]] ; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -324,7 +324,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: ; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_DATA-WITH-EVL: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll index c5396f2..f36919f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-store-with-gap.ll @@ -20,7 +20,7 @@ define void @store_factor_2_with_tail_gap(i64 %n, ptr %a) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i64> [[VEC_IND]], <16 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i64> [[TMP2]], <32 x i64> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> -; CHECK-NEXT: call void @llvm.masked.store.v32i64.p0(<32 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], i32 8, <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>) +; CHECK-NEXT: call void @llvm.masked.store.v32i64.p0(<32 x i64> [[INTERLEAVED_VEC]], ptr align 8 [[TMP1]], <32 x i1> <i1 true, 
i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 5ca9bfd..f2f6568 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -870,17 +870,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 ; RV64-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 ; RV64-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) +; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 +; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 +; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 +; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV64-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]] ; RV64-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] ; RV64-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] ; RV64-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] -; RV64-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 ; RV64-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 -; RV64-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 ; RV64-NEXT: store i7 [[TMP26]], ptr [[TMP22]], align 1 -; RV64-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 ; RV64-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 -; RV64-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV64-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; RV64-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 @@ -921,17 +921,17 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP18:%.*]] = insertelement <4 x i7> [[TMP17]], i7 [[TMP14]], i32 2 ; RV32-NEXT: [[TMP19:%.*]] = insertelement <4 x i7> [[TMP18]], i7 [[TMP15]], i32 3 ; RV32-NEXT: [[TMP20:%.*]] = add <4 x i7> [[TMP19]], splat (i7 1) +; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 +; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 +; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 +; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV32-NEXT: [[TMP21:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP4]] ; RV32-NEXT: [[TMP22:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP5]] ; RV32-NEXT: [[TMP23:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP6]] ; RV32-NEXT: [[TMP24:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP7]] -; RV32-NEXT: [[TMP25:%.*]] = extractelement <4 x i7> [[TMP20]], i32 0 ; RV32-NEXT: store i7 [[TMP25]], ptr [[TMP21]], align 1 -; RV32-NEXT: [[TMP26:%.*]] = extractelement <4 x i7> [[TMP20]], i32 1 ; RV32-NEXT: store i7 [[TMP26]], ptr [[TMP22]], 
align 1 -; RV32-NEXT: [[TMP27:%.*]] = extractelement <4 x i7> [[TMP20]], i32 2 ; RV32-NEXT: store i7 [[TMP27]], ptr [[TMP23]], align 1 -; RV32-NEXT: [[TMP28:%.*]] = extractelement <4 x i7> [[TMP20]], i32 3 ; RV32-NEXT: store i7 [[TMP28]], ptr [[TMP24]], align 1 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; RV32-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 @@ -992,7 +992,15 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP38:%.*]] = insertelement <4 x i7> [[TMP37]], i7 [[TMP34]], i32 2 ; RV64-UF2-NEXT: [[TMP39:%.*]] = insertelement <4 x i7> [[TMP38]], i7 [[TMP35]], i32 3 ; RV64-UF2-NEXT: [[TMP40:%.*]] = add <4 x i7> [[TMP31]], splat (i7 1) +; RV64-UF2-NEXT: [[TMP50:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0 +; RV64-UF2-NEXT: [[TMP51:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1 +; RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2 +; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3 ; RV64-UF2-NEXT: [[TMP41:%.*]] = add <4 x i7> [[TMP39]], splat (i7 1) +; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0 +; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1 +; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2 +; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3 ; RV64-UF2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP8]] ; RV64-UF2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP9]] ; RV64-UF2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP10]] @@ -1001,21 +1009,13 @@ define void @vector_reverse_irregular_type(ptr noalias %A, ptr noalias %B) { ; RV64-UF2-NEXT: [[TMP47:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP13]] ; RV64-UF2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP14]] ; RV64-UF2-NEXT: [[TMP49:%.*]] = getelementptr inbounds i7, ptr [[A]], i64 [[TMP15]] -; RV64-UF2-NEXT: [[TMP50:%.*]] = extractelement <4 x i7> [[TMP40]], i32 0 ; RV64-UF2-NEXT: store i7 [[TMP50]], ptr [[TMP42]], align 1 -; RV64-UF2-NEXT: [[TMP51:%.*]] = extractelement <4 x i7> [[TMP40]], i32 1 ; RV64-UF2-NEXT: store i7 [[TMP51]], ptr [[TMP43]], align 1 -; RV64-UF2-NEXT: [[TMP52:%.*]] = extractelement <4 x i7> [[TMP40]], i32 2 ; RV64-UF2-NEXT: store i7 [[TMP52]], ptr [[TMP44]], align 1 -; RV64-UF2-NEXT: [[TMP53:%.*]] = extractelement <4 x i7> [[TMP40]], i32 3 ; RV64-UF2-NEXT: store i7 [[TMP53]], ptr [[TMP45]], align 1 -; RV64-UF2-NEXT: [[TMP54:%.*]] = extractelement <4 x i7> [[TMP41]], i32 0 ; RV64-UF2-NEXT: store i7 [[TMP54]], ptr [[TMP46]], align 1 -; RV64-UF2-NEXT: [[TMP55:%.*]] = extractelement <4 x i7> [[TMP41]], i32 1 ; RV64-UF2-NEXT: store i7 [[TMP55]], ptr [[TMP47]], align 1 -; RV64-UF2-NEXT: [[TMP56:%.*]] = extractelement <4 x i7> [[TMP41]], i32 2 ; RV64-UF2-NEXT: store i7 [[TMP56]], ptr [[TMP48]], align 1 -; RV64-UF2-NEXT: [[TMP57:%.*]] = extractelement <4 x i7> [[TMP41]], i32 3 ; RV64-UF2-NEXT: store i7 [[TMP57]], ptr [[TMP49]], align 1 ; RV64-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; RV64-UF2-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index e84c0d6..8e562a9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -203,7 +203,7 @@ define void 
@safe_load_store_distance_not_pow_of_2(i64 %N) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 3, i64 6, i64 9, i64 12, i64 15, i64 18, i64 21>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [10 x [12 x i16]], ptr @a, i64 0, i64 8, <8 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> zeroinitializer, <8 x ptr> [[TMP7]], i32 2, <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> zeroinitializer, <8 x ptr> align 2 [[TMP7]], <8 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 24) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index c66d8d6..723b5e9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -63,12 +63,12 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-UF2-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[STEP_ADD]], splat (i64 8) ; CHECK-UF2-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP9]] ; CHECK-UF2-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP10]] -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-UF2-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-UF2-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], splat (i32 1) -; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP14]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; CHECK-UF2-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -165,12 +165,12 @@ define void @single_constant_stride_int_iv(ptr %p) { 
; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[TMP6]] ; CHECK-UF2-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[VEC_IND]] ; CHECK-UF2-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[STEP_ADD]] -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) -; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; CHECK-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; CHECK-UF2-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-UF2-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], splat (i32 1) -; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-UF2-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[TMP6]] ; CHECK-UF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -269,8 +269,8 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-UF2-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64() ; CHECK-UF2-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 8) ; CHECK-UF2-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP11]] -; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = getelementptr i8, <vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x i64> [[TMP9]] ; CHECK-UF2-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x ptr> [[VECTOR_GEP]], i32 0 +; CHECK-UF2-NEXT: [[STEP_ADD:%.*]] = getelementptr i8, <vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x i64> [[TMP9]] ; CHECK-UF2-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP12]], align 4 ; CHECK-UF2-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) ; CHECK-UF2-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 @@ -280,8 +280,8 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-UF2-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0 ; CHECK-UF2-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[TMP13]], splat (i32 1) ; CHECK-UF2-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[TMP15]], splat (i32 1) 
-; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> [[VECTOR_GEP]], i32 4, <vscale x 4 x i1> splat (i1 true)) -; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> [[STEP_ADD]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP16]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true)) +; CHECK-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP17]], <vscale x 4 x ptr> align 4 [[STEP_ADD]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-UF2-NEXT: [[TMP18:%.*]] = mul i64 8, [[TMP4]] ; CHECK-UF2-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP18]] @@ -893,14 +893,14 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[TMP34:%.*]] = mul nuw nsw <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT11]] ; STRIDED-UF2-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP33]] ; STRIDED-UF2-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP34]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP35]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META8:![0-9]+]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP36]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META8]] +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP35]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META8:![0-9]+]] +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP36]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META8]] ; STRIDED-UF2-NEXT: [[TMP37:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-UF2-NEXT: [[TMP38:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER12]], splat (i32 1) ; STRIDED-UF2-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP33]] ; STRIDED-UF2-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[P2]], <vscale x 4 x i64> [[TMP34]] -; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP37]], <vscale x 4 x ptr> [[TMP39]], i32 4, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META8]] -; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP38]], <vscale x 4 x ptr> [[TMP40]], i32 4, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META8]] +; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP37]], <vscale x 4 x ptr> align 4 [[TMP39]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META8]] +; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP38]], <vscale x 4 x ptr> align 4 [[TMP40]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META8]] ; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 
[[TMP30]] ; STRIDED-UF2-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] ; STRIDED-UF2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1227,12 +1227,12 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[VECTOR_GEP12:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 4 x i64> [[TMP16]] ; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = getelementptr i8, <vscale x 4 x ptr> [[VECTOR_GEP12]], <vscale x 4 x i64> [[TMP14]] ; STRIDED-UF2-NEXT: [[STEP_ADD13:%.*]] = getelementptr i8, <vscale x 4 x ptr> [[VECTOR_GEP]], <vscale x 4 x i64> [[TMP14]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[VECTOR_GEP12]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META15:![0-9]+]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[STEP_ADD]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META15]] +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[VECTOR_GEP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META15:![0-9]+]] +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[STEP_ADD]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !alias.scope [[META15]] ; STRIDED-UF2-NEXT: [[TMP19:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-UF2-NEXT: [[TMP20:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER14]], splat (i32 1) -; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> [[VECTOR_GEP]], i32 4, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]] -; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> [[STEP_ADD13]], i32 4, <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META18]], !noalias [[META15]] +; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP19]], <vscale x 4 x ptr> align 4 [[VECTOR_GEP]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META18:![0-9]+]], !noalias [[META15]] +; STRIDED-UF2-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x ptr> align 4 [[STEP_ADD13]], <vscale x 4 x i1> splat (i1 true)), !alias.scope [[META18]], !noalias [[META15]] ; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; STRIDED-UF2-NEXT: [[TMP21:%.*]] = mul i64 [[STRIDE]], [[TMP9]] ; STRIDED-UF2-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP21]] @@ -1348,8 +1348,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; NOSTRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], <vscale x 2 x i64> [[VEC_IND]] ; NOSTRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], <vscale x 2 x i64> [[STEP_ADD]] -; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) -; 
NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) +; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) +; NOSTRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) ; NOSTRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]] ; NOSTRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; NOSTRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 @@ -1432,8 +1432,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-UF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[IN:%.*]], <vscale x 2 x i64> [[VEC_IND]] ; STRIDED-UF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], <vscale x 2 x i64> [[STEP_ADD]] -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) -; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP8]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP7]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) +; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison) ; STRIDED-UF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[INDEX]] ; STRIDED-UF2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; STRIDED-UF2-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 46daee4..a07e031 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -173,12 +173,12 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP10:%.*]] = add 
<vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER1]] ; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 3 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] @@ -294,11 +294,11 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_IND4:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i32> [[VEC_IND4]], i32 0 -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true)) ; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i32> [[VEC_IND4]], i32 1 -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true)) ; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i32> [[VEC_IND4]], i32 3 -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true)) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP9]] ; NO-VP-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND4]], [[BROADCAST_SPLAT3]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -413,12 +413,12 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: 
[[TMP9:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP9]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[WIDE_MASKED_GATHER1]] ; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 2 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP13]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER2]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] @@ -535,11 +535,11 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_IND4:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i32> [[VEC_IND4]], i32 0 -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true)) ; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i32> [[VEC_IND4]], i32 1 -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> [[TMP12]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true)) ; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i32> [[VEC_IND4]], i32 2 -; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; NO-VP-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[VEC_IND4]], <vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true)) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i32 
[[INDEX]], [[TMP9]] ; NO-VP-NEXT: [[VEC_IND_NEXT5]] = add <vscale x 4 x i32> [[VEC_IND4]], [[BROADCAST_SPLAT3]] ; NO-VP-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -657,15 +657,15 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP11]], [[WIDE_MASKED_GATHER3]] ; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 2 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i32> [[TMP12]], [[WIDE_MASKED_GATHER4]] ; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], <vscale x 4 x i64> [[VEC_IND]], i32 3 -; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP15]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP16]] = add <vscale x 4 x i32> [[TMP14]], [[WIDE_MASKED_GATHER5]] ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll index b13c671..e9dcf8f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll @@ -52,9 +52,9 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4 ; NO-VP-NEXT: [[TMP7:%.*]] = icmp ne 
<vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer ; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_LOAD]] -; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]]) +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr align 4 [[TMP8]], <vscale x 4 x i1> [[TMP7]]) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NO-VP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[INC]] ; NO-VP-NEXT: br i1 [[TMP10]], label [[FOR_INC:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll index e70894b..7b0ac78 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll @@ -164,7 +164,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]] ; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[FOR_INC:%.*]] ; IF-EVL: loopend: @@ -199,7 +199,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; NO-VP-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP11]], i64 [[TMP12]] ; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP14]] ; NO-VP-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]]) -; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison) +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison) ; NO-VP-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]]) ; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP8]] ; NO-VP-NEXT: [[TMP18:%.*]] = mul i64 0, [[TMP3]] @@ -209,7 +209,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP20]] ; NO-VP-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]]) ; NO-VP-NEXT: [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE2]]) -; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE4]], ptr [[TMP22]], i32 4, <vscale x 4 x i1> [[REVERSE3]]) 
+; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE4]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[REVERSE3]]) ; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; NO-VP-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -316,7 +316,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP27]] ; IF-EVL-NEXT: [[TMP32:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: exit: @@ -336,7 +336,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; NO-VP-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; NO-VP-NEXT: [[REVERSE:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], <16 x i8> [[REVERSE]] -; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> [[TMP3]], i32 1, <16 x i1> splat (i1 true), <16 x i8> poison) +; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> align 1 [[TMP3]], <16 x i1> splat (i1 true), <16 x i8> poison) ; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[OFFSET_IDX]] ; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0 ; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 -15 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index c5319c6..f4c7c6f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -246,6 +246,57 @@ exit: ret void } +; Test for https://github.com/llvm/llvm-project/issues/162688. 
+define void @test_minbws_for_trunc(i32 %n, ptr noalias %p1, ptr noalias %p2) { +; CHECK-LABEL: define void @test_minbws_for_trunc( +; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[P1:%.*]], ptr noalias [[P2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_EXT:%.*]] = sext i16 [[IV]] to i64 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV_EXT]] +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[V1_TRUNC:%.*]] = trunc i32 [[V1]] to i16 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr [1 x [1 x i16]], ptr [[P2]], i64 [[IV_EXT]] +; CHECK-NEXT: store i16 [[V1_TRUNC]], ptr [[GEP2]], align 2 +; CHECK-NEXT: [[V1_TRUNC_I8:%.*]] = trunc i32 [[V1]] to i8 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[P2]], i64 [[IV_EXT]] +; CHECK-NEXT: store i8 [[V1_TRUNC_I8]], ptr [[GEP3]], align 1 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr [1 x i64], ptr [[P2]], i64 [[IV_EXT]] +; CHECK-NEXT: store i64 0, ptr [[GEP4]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 4 +; CHECK-NEXT: [[IV_NEXT_EXT:%.*]] = sext i16 [[IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[IV_NEXT_EXT]], 1024 +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ] + %iv.ext = sext i16 %iv to i64 + %gep1 = getelementptr i32, ptr %p1, i64 %iv.ext + %v1 = load i32, ptr %gep1, align 4 + %v1.trunc = trunc i32 %v1 to i16 + %gep2 = getelementptr [1 x [1 x i16]], ptr %p2, i64 %iv.ext + store i16 %v1.trunc, ptr %gep2, align 2 + %v1.trunc.i8 = trunc i32 %v1 to i8 + %gep3 = getelementptr i8, ptr %p2, i64 %iv.ext + store i8 %v1.trunc.i8, ptr %gep3, align 1 + %gep4 = getelementptr [1 x i64], ptr %p2, i64 %iv.ext + store i64 0, ptr %gep4, align 8 + %iv.next = add i16 %iv, 4 + %iv.next.ext = sext i16 %iv.next to i32 + %cmp = icmp ne i32 %iv.next.ext, 1024 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } attributes #1 = { "target-features"="+64bit,+v" } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index c34417b..1e21c75 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -289,8 +289,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10) ; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], splat (i64 10) -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP1]], <4 x i64> poison) -; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[BROADCAST_SPLAT]], i32 8, <4 x i1> [[TMP2]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 [[BROADCAST_SPLAT]], <4 x i1> [[TMP1]], <4 x i64> poison) +; FIXEDLEN-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 [[BROADCAST_SPLAT]], <4 x i1> 
[[TMP2]], <4 x i64> poison) ; FIXEDLEN-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> [[WIDE_MASKED_GATHER]], <4 x i64> zeroinitializer ; FIXEDLEN-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[WIDE_MASKED_GATHER1]], <4 x i64> zeroinitializer ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] @@ -753,8 +753,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; FIXEDLEN-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; FIXEDLEN-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i64> [[VEC_IND]], splat (i64 10) ; FIXEDLEN-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i64> [[STEP_ADD]], splat (i64 10) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP1]]) -; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT2]], i32 8, <4 x i1> [[TMP2]]) +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> align 8 [[BROADCAST_SPLAT2]], <4 x i1> [[TMP1]]) +; FIXEDLEN-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[BROADCAST_SPLAT]], <4 x ptr> align 8 [[BROADCAST_SPLAT2]], <4 x i1> [[TMP2]]) ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll index 27d5e64..193b5d4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -46,7 +46,7 @@ define void @PR31671(float %x, ptr %d) #0 { ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> ; CHECK-NEXT: [[TMP4:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP1]] -; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP4]], <16 x ptr> [[TMP2]], i32 4, <16 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP4]], <16 x ptr> align 4 [[TMP2]], <16 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 80) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6384 @@ -107,24 +107,24 @@ define void @PR31671(float %x, ptr %d) #0 { ; FORCE-NEXT: [[WIDE_VEC13:%.*]] = load <10 x float>, ptr [[TMP22]], align 4 ; FORCE-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <10 x float> [[WIDE_VEC13]], <10 x float> poison, <2 x i32> <i32 0, i32 5> ; FORCE-NEXT: [[TMP24:%.*]] = fadd <2 x float> [[STRIDED_VEC8]], [[TMP12]] +; FORCE-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0 +; FORCE-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[TMP24]], i32 1 ; FORCE-NEXT: [[TMP25:%.*]] = fadd <2 x float> [[STRIDED_VEC10]], [[TMP13]] +; FORCE-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP25]], i32 0 +; FORCE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[TMP25]], i32 1 ; FORCE-NEXT: [[TMP26:%.*]] = fadd <2 x float> 
[[STRIDED_VEC12]], [[TMP14]] +; FORCE-NEXT: [[TMP32:%.*]] = extractelement <2 x float> [[TMP26]], i32 0 +; FORCE-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[TMP26]], i32 1 ; FORCE-NEXT: [[TMP27:%.*]] = fadd <2 x float> [[STRIDED_VEC14]], [[TMP15]] -; FORCE-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 0 +; FORCE-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP27]], i32 0 +; FORCE-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[TMP27]], i32 1 ; FORCE-NEXT: store float [[TMP28]], ptr [[TMP16]], align 4 -; FORCE-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[TMP24]], i32 1 ; FORCE-NEXT: store float [[TMP29]], ptr [[TMP17]], align 4 -; FORCE-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP25]], i32 0 ; FORCE-NEXT: store float [[TMP30]], ptr [[TMP18]], align 4 -; FORCE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[TMP25]], i32 1 ; FORCE-NEXT: store float [[TMP31]], ptr [[TMP19]], align 4 -; FORCE-NEXT: [[TMP32:%.*]] = extractelement <2 x float> [[TMP26]], i32 0 ; FORCE-NEXT: store float [[TMP32]], ptr [[TMP20]], align 4 -; FORCE-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[TMP26]], i32 1 ; FORCE-NEXT: store float [[TMP33]], ptr [[TMP21]], align 4 -; FORCE-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP27]], i32 0 ; FORCE-NEXT: store float [[TMP34]], ptr [[TMP22]], align 4 -; FORCE-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[TMP27]], i32 1 ; FORCE-NEXT: store float [[TMP35]], ptr [[TMP23]], align 4 ; FORCE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCE-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6392 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index 590b269..baedf34 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -24,39 +24,39 @@ define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B, ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP9]], i32 4, <16 x i1> [[TMP7]], <16 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 4 [[TMP9]], <16 x i1> [[TMP7]], <16 x float> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fmul <16 x float> [[WIDE_MASKED_LOAD]], splat (float 2.000000e+00) ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP10]], i32 3 -; CHECK-NEXT: [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]]) ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x float> [[TMP10]], i32 4 -; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]]) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x float> [[TMP10]], i32 5 -; CHECK-NEXT: 
[[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x float> [[TMP10]], i32 6 -; CHECK-NEXT: [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]]) ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x float> [[TMP10]], i32 7 -; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]]) ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x float> [[TMP10]], i32 8 -; CHECK-NEXT: [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]]) ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x float> [[TMP10]], i32 9 -; CHECK-NEXT: [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]]) ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x float> [[TMP10]], i32 10 -; CHECK-NEXT: [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]]) ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x float> [[TMP10]], i32 11 -; CHECK-NEXT: [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]]) ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x float> [[TMP10]], i32 12 -; CHECK-NEXT: [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]]) ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x float> [[TMP10]], i32 13 -; CHECK-NEXT: [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]]) ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x float> [[TMP10]], i32 14 -; CHECK-NEXT: [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]]) ; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x float> [[TMP10]], i32 15 +; CHECK-NEXT: [[TMP12:%.*]] = tail call float @llvm.pow.f32(float [[TMP11]], float [[X:%.*]]) +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.pow.f32(float [[TMP13]], float [[X]]) +; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.pow.f32(float [[TMP15]], float [[X]]) +; CHECK-NEXT: [[TMP18:%.*]] = tail call float @llvm.pow.f32(float [[TMP17]], float [[X]]) +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.pow.f32(float [[TMP19]], float [[X]]) +; CHECK-NEXT: [[TMP22:%.*]] = tail call float @llvm.pow.f32(float [[TMP21]], float [[X]]) +; CHECK-NEXT: [[TMP24:%.*]] = tail call float @llvm.pow.f32(float [[TMP23]], float [[X]]) +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.pow.f32(float [[TMP25]], float [[X]]) +; CHECK-NEXT: [[TMP28:%.*]] = tail call float @llvm.pow.f32(float [[TMP27]], float [[X]]) +; CHECK-NEXT: [[TMP30:%.*]] = tail call float @llvm.pow.f32(float [[TMP29]], float [[X]]) +; CHECK-NEXT: [[TMP32:%.*]] = tail call float @llvm.pow.f32(float [[TMP31]], float [[X]]) +; CHECK-NEXT: [[TMP34:%.*]] = tail call float @llvm.pow.f32(float [[TMP33]], float [[X]]) +; CHECK-NEXT: [[TMP36:%.*]] = tail call float @llvm.pow.f32(float [[TMP35]], float [[X]]) +; CHECK-NEXT: [[TMP38:%.*]] = tail call float @llvm.pow.f32(float [[TMP37]], float [[X]]) +; CHECK-NEXT: [[TMP40:%.*]] = tail call float @llvm.pow.f32(float [[TMP39]], float [[X]]) ; CHECK-NEXT: [[TMP42:%.*]] = tail call float @llvm.pow.f32(float [[TMP41]], float [[X]]) ; CHECK-NEXT: [[TMP43:%.*]] = tail call float @llvm.pow.f32(float [[TMP12]], float [[X]]) ; CHECK-NEXT: [[TMP44:%.*]] = tail call float @llvm.pow.f32(float [[TMP14]], float [[X]]) @@ -90,8 +90,8 @@ define void @test_replicate_call_chain(float %x, ptr noalias %A, ptr noalias %B, ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x float> [[TMP71]], float [[TMP56]], i32 13 ; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x float> [[TMP72]], float 
[[TMP57]], i32 14 ; CHECK-NEXT: [[TMP74:%.*]] = insertelement <16 x float> [[TMP73]], float [[TMP58]], i32 15 -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP74]], ptr [[TMP5]], i32 4, <16 x i1> [[TMP7]]) -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr [[TMP5]], i32 4, <16 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP74]], ptr align 4 [[TMP5]], <16 x i1> [[TMP7]]) +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> zeroinitializer, ptr align 4 [[TMP5]], <16 x i1> [[TMP6]]) ; CHECK-NEXT: store float 0.000000e+00, ptr [[E:%.*]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 @@ -324,7 +324,7 @@ define i64 @avx512_cond_load_cost(ptr %src, i32 %a, i64 %b, i32 %c, i32 %d) #1 { ; CHECK-NEXT: [[TMP67:%.*]] = or <8 x i32> [[TMP66]], [[TMP34]] ; CHECK-NEXT: [[TMP68:%.*]] = sext <8 x i32> [[TMP67]] to <8 x i64> ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr { i64, i64, i64 }, ptr [[SRC:%.*]], <8 x i64> [[TMP68]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> [[TMP69]], i32 8, <8 x i1> [[TMP1]], <8 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> align 8 [[TMP69]], <8 x i1> [[TMP1]], <8 x i64> poison) ; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i64> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP1]], <8 x i64> [[TMP70]], <8 x i64> zeroinitializer ; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll index 04fd289..62fda4e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll @@ -37,7 +37,7 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { ; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]] -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP26]], i32 8, <4 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr align 8 [[TMP26]], <4 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -149,7 +149,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], [[TMP24]] ; CHECK-NEXT: [[TMP27:%.*]] = sext i32 [[TMP26]] to i64 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]] -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP28]], i32 8, <4 x i1> [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr align 8 [[TMP28]], <4 x i1> [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[TMP5]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index 249efe1..c5f581fa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -26,7 +26,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[REVERSE]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP6]], <4 x i1> [[REVERSE]], <4 x i32> poison) ; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10) diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll index 8126c70..4423b89e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll @@ -28,7 +28,7 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP0]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -70,7 +70,7 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP0]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -112,7 +112,7 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out ; 
CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP0]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -155,7 +155,7 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP0]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -197,7 +197,7 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[INPUT]], <4 x i64> <i64 -2, i64 0, i64 2, i64 4> -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -240,9 +240,9 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[INPUT]], <4 x i64> <i64 -1, i64 0, i64 1, i64 2> -; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[PTRS]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP6]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[PTRS]], align 8 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP6]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), 
!invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -301,7 +301,7 @@ define void @drop_nonvector_nuw_nsw_avx1(ptr noalias nocapture readonly %input, ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x ptr> [[TMP15]], ptr [[TMP12]], i32 2 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> [[TMP16]], ptr [[TMP13]], i32 3 ; CHECK-NEXT: store <4 x ptr> [[TMP17]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP10]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[TMP10]], <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 0 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP21]], align 4 @@ -383,7 +383,7 @@ define void @drop_scalar_exact(ptr noalias nocapture readonly %input, ptr %outpu ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[INPUT]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr align 4 [[INPUT]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -432,14 +432,14 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[VEC_IND]] to <4 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP3]], i32 8, <4 x i1> [[TMP0]], <4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP0]], <4 x double> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[PREDPHI]], i32 3 ; CHECK-NEXT: store double [[TMP5]], ptr [[P1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; entry: @@ -479,7 +479,7 @@ define void 
@preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[INPUT]], <4 x i64> <i64 0, i64 0, i64 1, i64 1> -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x float> poison), !invariant.load [[META0]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x float> poison), !invariant.load [[META0]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_GATHER]] ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] @@ -807,7 +807,7 @@ define void @Bgep_inbounds_unconditionally_due_to_store(ptr noalias %B, ptr read ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll index 2ef9d4b..3718ad2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll @@ -14,7 +14,7 @@ define x86_fp80 @test() { ; CHECK-NEXT: br label [[FOR_BODY3_I_3:%.*]] ; CHECK: for.body3.i.3: ; CHECK-NEXT: [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ] -; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ] +; CHECK-NEXT: [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 0xK00000000000000000000, [[FOO_EXIT]] ] ; CHECK-NEXT: [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000 ; CHECK-NEXT: [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1 ; CHECK-NEXT: [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1 @@ -28,7 +28,7 @@ foo.exit: for.body3.i.3: ; preds = %for.body3.i.3, %foo.exit %n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ] - %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ] + %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ zeroinitializer, %foo.exit ] %mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000 %dec.i.3 = add nsw i64 %n.addr.112.i.3, -1 %cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index 2f33e11..db592f9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -33,13 +33,13 @@ define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4 ; AVX512-NEXT: [[TMP3:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr 
[[INDEX:%.*]], i64 [[INDEX1]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP4]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP4]], <16 x i1> [[TMP3]], <16 x i32> poison) ; AVX512-NEXT: [[TMP6:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64> ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <16 x i64> [[TMP6]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP7]], i32 4, <16 x i1> [[TMP3]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[TMP7]], <16 x i1> [[TMP3]], <16 x float> poison) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[INDEX1]] -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr [[TMP9]], i32 4, <16 x i1> [[TMP3]]) +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr align 4 [[TMP9]], <16 x i1> [[TMP3]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 ; AVX512-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX512-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -59,13 +59,13 @@ define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 ; FVW2-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer ; FVW2-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[INDEX1]] -; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP4]], i32 4, <2 x i1> [[TMP3]], <2 x i32> poison) +; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 [[TMP4]], <2 x i1> [[TMP3]], <2 x i32> poison) ; FVW2-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64> ; FVW2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <2 x i64> [[TMP6]] -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP7]], i32 4, <2 x i1> [[TMP3]], <2 x float> poison) +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 [[TMP7]], <2 x i1> [[TMP3]], <2 x float> poison) ; FVW2-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[INDEX1]] -; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr [[TMP9]], i32 4, <2 x i1> [[TMP3]]) +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr align 4 [[TMP9]], <2 x i1> [[TMP3]]) ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2 ; FVW2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; FVW2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -126,13 +126,13 @@ define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 
144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP0]], <16 x i1> splat (i1 true), <16 x i32> poison) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[TMP2]], <16 x i1> [[TMP1]], <16 x float> poison) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01) ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -161,7 +161,7 @@ define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison) +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 [[TMP9]], <2 x i1> [[TMP8]], <2 x float> poison) ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -241,13 +241,13 @@ define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) { ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP0]], <16 x i1> splat (i1 
true), <16 x i32> poison) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[TMP2]], <16 x i1> [[TMP1]], <16 x float> poison) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], splat (float 5.000000e-01) ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -276,7 +276,7 @@ define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) { ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison) +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 [[TMP9]], <2 x i1> [[TMP8]], <2 x float> poison) ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -343,13 +343,13 @@ define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noali ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP0]], <16 x i1> splat (i1 true), <16 x i32> poison) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> 
@llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> align 4 [[TMP2]], <16 x i1> [[TMP1]], <16 x float> poison) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01) ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -378,7 +378,7 @@ define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noali ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison) +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> align 4 [[TMP9]], <2 x i1> [[TMP8]], <2 x float> poison) ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -444,13 +444,13 @@ define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noal ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP0]], <16 x i1> splat (i1 true), <16 x i32> poison) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> align 4 [[TMP2]], <16 x i1> [[TMP1]], <16 x float> poison) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01) ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x 
ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -479,7 +479,7 @@ define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noal ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison) +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> align 4 [[TMP9]], <2 x i1> [[TMP8]], <2 x float> poison) ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -545,13 +545,13 @@ define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noal ; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[TMP0]], <16 x i1> splat (i1 true), <16 x i32> poison) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> align 4 [[TMP2]], <16 x i1> [[TMP1]], <16 x float> poison) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01) ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> align 4 [[TMP4]], <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256) ; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -580,7 +580,7 @@ define void @foo2_addrspace3(ptr addrspace(0) 
noalias %in, ptr addrspace(1) noal ; FVW2-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer ; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 -; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison) +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 [[TMP9]], <2 x i1> [[TMP8]], <2 x float> poison) ; FVW2-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01) ; FVW2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 ; FVW2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -685,10 +685,10 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX]] ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]] ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP17]], align 4, !alias.scope [[META8:![0-9]+]] -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP14]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> align 4 [[TMP14]], <16 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x float>, ptr [[TMP16]], align 4, !alias.scope [[META15:![0-9]+]] ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP14]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD6]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD6]], <16 x ptr> align 4 [[TMP20]], <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX512-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 1024 ; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -721,10 +721,10 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[OFFSET_IDX21]] ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]] ; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x float>, ptr [[TMP29]], align 4, !alias.scope [[META8]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD13]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD13]], <8 x ptr> align 4 [[TMP26]], <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x float>, ptr [[TMP28]], align 4, !alias.scope [[META15]] ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD14]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), 
!alias.scope [[META11]], !noalias [[META13]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD14]], <8 x ptr> align 4 [[TMP32]], <8 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]] ; AVX512-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8 ; AVX512-NEXT: [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512 ; AVX512-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]] @@ -786,16 +786,16 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]] ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP21]], align 4, !alias.scope [[META8:![0-9]+]] ; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; FVW2-NEXT: store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; FVW2-NEXT: store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP16]], align 4, !alias.scope [[META15:![0-9]+]] -; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1 -; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1 ; FVW2-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0 -; FVW2-NEXT: store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1 -; FVW2-NEXT: store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]] +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1 +; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1 +; FVW2-NEXT: store float [[TMP28]], ptr [[TMP25]], align 4, !alias.scope [[META11]], !noalias [[META13]] +; FVW2-NEXT: store float [[TMP29]], ptr [[TMP22]], align 4, !alias.scope [[META11]], !noalias [[META13]] ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FVW2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FVW2-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll index 2f44c7e..8eb2c0c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll @@ -15,7 +15,7 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP4]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]] -; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP8]], i32 2, <4 x i1> [[TMP7]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr align 2 [[TMP8]], <4 x i1> [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 ; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP0:![0-9]+]] @@ -82,11 +82,11 @@ define void @gep_use_outside_loop(ptr noalias %dst, ptr %src) { ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], <4 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10) -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP6]], i32 2, <4 x i1> [[TMP5]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr align 2 [[TMP6]], <4 x i1> [[TMP5]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index a19b294..080b4e8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -31,10 +31,10 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], <8 x i64> [[STEP_ADD]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], <8 x i64> [[STEP_ADD1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { i32, [8 x i32] }, ptr [[DST]], <8 x i64> [[STEP_ADD2]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VEC_IND4]], <8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true)) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[STEP_ADD5]], <8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true)) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[STEP_ADD6]], <8 x ptr> [[TMP3]], i32 8, <8 x i1> splat (i1 true)) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[STEP_ADD7]], <8 x ptr> [[TMP4]], i32 8, <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[VEC_IND4]], <8 x ptr> align 8 [[TMP1]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[STEP_ADD5]], <8 x ptr> align 8 [[TMP2]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[STEP_ADD6]], <8 x ptr> align 8 [[TMP3]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[STEP_ADD7]], <8 x ptr> align 8 [[TMP4]], <8 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], splat (i64 8) ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <8 x i32> [[STEP_ADD7]], splat (i32 8) @@ -63,7 +63,7 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[VEC_IND17:%.*]] = phi <4 x i32> [ [[INDUCTION16]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT18:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP7:%.*]] = 
getelementptr { i32, [8 x i32] }, ptr [[DST]], <4 x i64> [[VEC_IND12]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VEC_IND17]], <4 x ptr> [[TMP7]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VEC_IND17]], <4 x ptr> align 8 [[TMP7]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX11]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT13]] = add <4 x i64> [[VEC_IND12]], splat (i64 4) ; CHECK-NEXT: [[VEC_IND_NEXT18]] = add <4 x i32> [[VEC_IND17]], splat (i32 4) @@ -297,36 +297,36 @@ define void @multiple_pointer_ivs_with_scalar_uses_only(ptr %A, ptr %B) #0 { ; CHECK-NEXT: [[TMP24:%.*]] = lshr <16 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = trunc <16 x i32> [[TMP24]] to <16 x i8> ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[TMP25]], i32 0 -; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META18:![0-9]+]], !noalias [[META15]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP25]], i32 1 -; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[TMP25]], i32 2 -; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i8> [[TMP25]], i32 3 -; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[TMP25]], i32 4 -; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP25]], i32 5 -; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[TMP25]], i32 6 -; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i8> [[TMP25]], i32 7 -; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[TMP25]], i32 8 -; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP25]], i32 9 -; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[TMP25]], i32 10 -; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i8> [[TMP25]], i32 11 -; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[TMP25]], i32 12 -; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP25]], i32 13 -; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[TMP25]], i32 14 -; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[TMP41:%.*]] = 
extractelement <16 x i8> [[TMP25]], i32 15 +; CHECK-NEXT: store i8 [[TMP26]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META18:![0-9]+]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP27]], ptr [[NEXT_GEP7]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP28]], ptr [[NEXT_GEP8]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP29]], ptr [[NEXT_GEP9]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP30]], ptr [[NEXT_GEP10]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP31]], ptr [[NEXT_GEP11]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP32]], ptr [[NEXT_GEP12]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP33]], ptr [[NEXT_GEP13]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP34]], ptr [[NEXT_GEP14]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP35]], ptr [[NEXT_GEP15]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP36]], ptr [[NEXT_GEP16]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP37]], ptr [[NEXT_GEP17]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP38]], ptr [[NEXT_GEP18]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP39]], ptr [[NEXT_GEP19]], align 1, !alias.scope [[META18]], !noalias [[META15]] +; CHECK-NEXT: store i8 [[TMP40]], ptr [[NEXT_GEP20]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: store i8 [[TMP41]], ptr [[NEXT_GEP21]], align 1, !alias.scope [[META18]], !noalias [[META15]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967184 @@ -465,10 +465,10 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP16]], i32 8 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP16]], i32 16 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[TMP16]], i32 24 -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP16]], i32 4, <8 x i1> [[TMP8]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP21]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP9]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP22]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP10]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP23]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP11]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr align 4 [[TMP16]], <8 x i1> [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP21]], ptr align 4 [[TMP25]], <8 x i1> [[TMP9]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP22]], ptr align 4 [[TMP26]], <8 x i1> [[TMP10]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP23]], ptr align 4 [[TMP27]], <8 x i1> [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], splat (i64 8) ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 @@ -494,7 +494,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT6]], [[VEC_IND5]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr 
i32, ptr [[A]], i64 [[INDEX4]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP33]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr align 4 [[TMP35]], <4 x i1> [[TMP33]]) ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 2f96278..9240484 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -360,7 +360,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N) ; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13> ; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison), !alias.scope [[META15:![0-9]+]], !noalias [[META13]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP54]], <4 x i1> splat (i1 true), <4 x i32> poison), !alias.scope [[META15:![0-9]+]], !noalias [[META13]] ; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll index 8e0401d..14fb2a7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-ptradd-with-replicated-operand.ll @@ -69,40 +69,40 @@ define ptr @test_interleave_ptradd_with_replicated_op(ptr %m) #0 { ; CHECK-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <8 x i32> [[WIDE_VEC24]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; CHECK-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <8 x i32> [[WIDE_VEC24]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> ; CHECK-NEXT: [[TMP36:%.*]] = add <4 x i32> [[STRIDED_VEC17]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 ; CHECK-NEXT: [[TMP37:%.*]] = add <4 x i32> [[STRIDED_VEC20]], [[STRIDED_VEC19]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x 
i32> [[TMP37]], i32 2 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 ; CHECK-NEXT: [[TMP38:%.*]] = add <4 x i32> [[STRIDED_VEC23]], [[STRIDED_VEC22]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3 ; CHECK-NEXT: [[TMP39:%.*]] = add <4 x i32> [[STRIDED_VEC26]], [[STRIDED_VEC25]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP40]], ptr [[NEXT_GEP12]], align 4 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP36]], i32 1 ; CHECK-NEXT: store i32 [[TMP41]], ptr [[NEXT_GEP2]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[TMP36]], i32 2 ; CHECK-NEXT: store i32 [[TMP42]], ptr [[NEXT_GEP3]], align 4 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[TMP36]], i32 3 ; CHECK-NEXT: store i32 [[TMP43]], ptr [[NEXT_GEP4]], align 4 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i32> [[TMP37]], i32 0 ; CHECK-NEXT: store i32 [[TMP44]], ptr [[NEXT_GEP13]], align 4 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP37]], i32 1 ; CHECK-NEXT: store i32 [[TMP45]], ptr [[NEXT_GEP6]], align 4 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP37]], i32 2 ; CHECK-NEXT: store i32 [[TMP46]], ptr [[NEXT_GEP7]], align 4 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP37]], i32 3 ; CHECK-NEXT: store i32 [[TMP47]], ptr [[NEXT_GEP8]], align 4 -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP38]], i32 0 ; CHECK-NEXT: store i32 [[TMP48]], ptr [[NEXT_GEP14]], align 4 -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP38]], i32 1 ; CHECK-NEXT: store i32 [[TMP49]], ptr [[NEXT_GEP10]], align 4 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i32> [[TMP38]], i32 2 ; CHECK-NEXT: store i32 [[TMP50]], ptr [[NEXT_GEP11]], align 4 -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i32> [[TMP38]], i32 3 ; CHECK-NEXT: store i32 [[TMP51]], ptr [[NEXT_GEP17]], align 4 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 ; CHECK-NEXT: store i32 [[TMP52]], ptr [[NEXT_GEP15]], align 4 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 ; CHECK-NEXT: store i32 [[TMP53]], ptr [[NEXT_GEP18]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 ; CHECK-NEXT: store i32 [[TMP54]], ptr [[NEXT_GEP19]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[NEXT_GEP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll index d17361a..829fdff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll @@ -28,18 +28,18 @@ 
define void @pr63602_1(ptr %arr) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 -; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP5]] +; CHECK-NEXT: store i32 [[TMP8]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP16]] ; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <12 x i32>, ptr [[TMP17]], align 4 @@ -47,13 +47,13 @@ define void @pr63602_1(ptr %arr) { ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC2]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; CHECK-NEXT: [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 -; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 -; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP12]], align 4 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP14]], align 4 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP0:![0-9]+]] @@ -141,18 +141,18 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP10]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 -; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 -; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 -; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP9]] +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP17]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP18]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[TMP1]], 2 ; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP3]], 2 @@ -161,10 +161,10 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP18]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP28]], i32 0 ; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 1 ; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 2 @@ -179,13 +179,13 @@ define void @pr63602_2(ptr %arr) { ; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 
3 ; CHECK-NEXT: [[TMP44:%.*]] = add <4 x i32> [[TMP35]], [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: store i32 [[TMP45]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2 -; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3 -; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP15]], align 4 +; CHECK-NEXT: store i32 [[TMP45]], ptr [[TMP16]], align 4 +; CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP17]], align 4 +; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP18]], align 4 +; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll index 3efb82de..0678e9e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll @@ -26,14 +26,14 @@ define void @avoid_sinking_store_across_load(ptr %arr) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[VEC_IND2]] ; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[STRIDED_VEC]], splat (i32 25) -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP7]], <4 x ptr> [[TMP6]], i32 4, <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[STRIDED_VEC]], splat (i32 25) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP7]], <4 x ptr> align 4 [[TMP6]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <12 x i32>, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> ; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[STRIDED_VEC5]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> align 4 [[TMP4]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]], splat (i64 12) diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 9a3616a..0bc86ff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -36,7 +36,7 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr 
%b, i32 %k) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT5]], ptr [[TMP2]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison), !alias.scope [[META3]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> align 4 [[BROADCAST_SPLAT]], <16 x i1> [[TMP1]], <16 x i32> poison), !alias.scope [[META3]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -62,7 +62,7 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX10]] ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT12]], ptr [[TMP6]], align 4, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[BROADCAST_SPLAT9]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison), !alias.scope [[META3]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[BROADCAST_SPLAT9]], <8 x i1> [[TMP5]], <8 x i32> poison), !alias.scope [[META3]] ; CHECK-NEXT: [[PREDPHI14:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[WIDE_MASKED_GATHER13]], <8 x i32> splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX10]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC7]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 5d40e6a..7b9166d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -160,7 +160,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT5]] ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META11]], !noalias [[META14]] -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[BROADCAST_SPLAT]], <16 x ptr> [[BROADCAST_SPLAT7]], i32 4, <16 x i1> [[TMP2]]), !alias.scope [[META14]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[BROADCAST_SPLAT]], <16 x ptr> align 4 [[BROADCAST_SPLAT7]], <16 x i1> [[TMP2]]), !alias.scope [[META14]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -306,8 +306,8 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> 
[[WIDE_LOAD]], [[BROADCAST_SPLAT13]] ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 4, !alias.scope [[META20]], !noalias [[META23]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 8, <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope [[META26:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x ptr> [[BROADCAST_SPLAT15]], i32 4, <16 x i1> [[TMP2]]), !alias.scope [[META27:![0-9]+]], !noalias [[META26]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 8 [[TMP3]], <16 x i1> [[TMP2]], <16 x i32> poison), !alias.scope [[META26:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x ptr> align 4 [[BROADCAST_SPLAT15]], <16 x i1> [[TMP2]]), !alias.scope [[META27:![0-9]+]], !noalias [[META26]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] @@ -335,8 +335,8 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD25]], [[BROADCAST_SPLAT23]] ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT21]], ptr [[TMP5]], align 4, !alias.scope [[META20]], !noalias [[META23]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX18]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 8, <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META26]] -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD26]], <8 x ptr> [[BROADCAST_SPLAT26]], i32 4, <8 x i1> [[TMP6]]), !alias.scope [[META27]], !noalias [[META26]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 8 [[TMP7]], <8 x i1> [[TMP6]], <8 x i32> poison), !alias.scope [[META26]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[WIDE_MASKED_LOAD26]], <8 x ptr> align 4 [[BROADCAST_SPLAT26]], <8 x i1> [[TMP6]]), !alias.scope [[META27]], !noalias [[META26]] ; CHECK-NEXT: [[INDEX_NEXT27]] = add nuw i64 [[INDEX18]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT27]], [[N_VEC17]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll b/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll index 78f96ca..bcb6b5c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/iv-live-outs.ll @@ -29,10 +29,10 @@ define i64 @test_pr98660(ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD1]], zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD2]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD3]], zeroinitializer -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP9]], i32 4, <8 x i1> [[TMP17]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP14]], i32 4, <8 x i1> [[TMP18]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP15]], i32 4, <8 x 
i1> [[TMP19]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP16]], i32 4, <8 x i1> [[TMP20]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr align 4 [[TMP9]], <8 x i1> [[TMP17]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr align 4 [[TMP14]], <8 x i1> [[TMP18]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr align 4 [[TMP15]], <8 x i1> [[TMP19]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr align 4 [[TMP16]], <8 x i1> [[TMP20]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 2a8c698..8771dc9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -721,10 +721,10 @@ define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) { ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8 ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP65]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP65]], <4 x i1> [[TMP40]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP48]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP56]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP72]], <4 x i1> [[TMP64]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1234,10 +1234,10 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; 
CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP64]], <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP69]], <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1365,10 +1365,10 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP64]], <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP69]], <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1496,10 +1496,10 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP64]], <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP69]], <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1796,10 +1796,10 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP64]], <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP69]], <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -1928,10 +1928,10 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 
x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP64]], <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP69]], <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer @@ -2070,10 +2070,10 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) { ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP64]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP64]], <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP69]], <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP70]], <4 x i1> [[TMP55]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP71]], <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll index 9d4ddf9..2c172b2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll @@ -38,10 +38,10 @@ define 
i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP16]], i32 8 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP16]], i32 16 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[TMP16]], i32 24 -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP16]], i32 4, <8 x i1> [[TMP8]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP21]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP9]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP22]], ptr [[TMP26]], i32 4, <8 x i1> [[TMP10]]) -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP23]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP11]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr align 4 [[TMP16]], <8 x i1> [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP21]], ptr align 4 [[TMP25]], <8 x i1> [[TMP9]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP22]], ptr align 4 [[TMP26]], <8 x i1> [[TMP10]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP23]], ptr align 4 [[TMP27]], <8 x i1> [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], splat (i64 8) ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 @@ -67,7 +67,7 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX4]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP33]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP36]], ptr align 4 [[TMP35]], <4 x i1> [[TMP33]]) ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 100 @@ -132,7 +132,7 @@ define void @test_scalar_cost_single_store_loop_invariant_cond(ptr %dst, i1 %c) ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[NEXT_GEP]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr align 4 [[NEXT_GEP]], <8 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -200,8 +200,8 @@ define void @test_scalar_cost_single_store_loop_varying_cond(ptr %dst, ptr noali ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC]], splat (i32 123) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[STRIDED_VEC5]], splat (i32 123) ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[NEXT_GEP]], i32 4, <4 x i1> [[TMP8]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <4 x i1> [[TMP9]]) 
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[NEXT_GEP]], <4 x i1> [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> zeroinitializer, ptr align 4 [[TMP11]], <4 x i1> [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index b907e7e..6558f76 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -40,10 +40,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 ; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100) ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP6]], <8 x i1> [[TMP5]], <8 x i32> poison) ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr [[TMP9]], i32 4, <8 x i1> [[TMP5]]) +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP8]], ptr align 4 [[TMP9]], <8 x i1> [[TMP5]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -87,10 +87,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16 ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 24 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP12]], <8 x i1> [[TMP8]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP14]], <8 x i1> [[TMP9]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP15]], <8 x i1> [[TMP10]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP16]], <8 x i1> [[TMP11]], <8 x i32> poison) ; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: 
[[TMP18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] ; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] @@ -99,10 +99,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 8 ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16 ; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 24 -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP17]], ptr [[TMP21]], i32 4, <8 x i1> [[TMP8]]) -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <8 x i1> [[TMP9]]) -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <8 x i1> [[TMP10]]) -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP11]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP17]], ptr align 4 [[TMP21]], <8 x i1> [[TMP8]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP18]], ptr align 4 [[TMP23]], <8 x i1> [[TMP9]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP19]], ptr align 4 [[TMP24]], <8 x i1> [[TMP10]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP20]], ptr align 4 [[TMP25]], <8 x i1> [[TMP11]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -119,10 +119,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP38]], align 4 ; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX11]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP31]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP31]], <8 x i1> [[TMP30]], <8 x i32> poison) ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX11]] -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr [[TMP34]], i32 4, <8 x i1> [[TMP30]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP33]], ptr align 4 [[TMP34]], <8 x i1> [[TMP30]]) ; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 ; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX2-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -166,10 +166,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 32 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP12]], i32 48 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call 
<16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP12]], <16 x i1> [[TMP8]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP14]], <16 x i1> [[TMP9]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP15]], <16 x i1> [[TMP10]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP16]], <16 x i1> [[TMP11]], <16 x i32> poison) ; AVX512-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP18:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] ; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] @@ -178,10 +178,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP21]], i32 16 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i32 32 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 48 -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP17]], ptr [[TMP21]], i32 4, <16 x i1> [[TMP8]]) -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP18]], ptr [[TMP23]], i32 4, <16 x i1> [[TMP9]]) -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP19]], ptr [[TMP24]], i32 4, <16 x i1> [[TMP10]]) -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP20]], ptr [[TMP25]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP17]], ptr align 4 [[TMP21]], <16 x i1> [[TMP8]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP18]], ptr align 4 [[TMP23]], <16 x i1> [[TMP9]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP19]], ptr align 4 [[TMP24]], <16 x i1> [[TMP10]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP20]], ptr align 4 [[TMP25]], <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -198,10 +198,10 @@ define void @foo1(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP28]], align 4 ; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX11]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP31]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP31]], <16 x i1> [[TMP30]], <16 x i32> poison) ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX11]] -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP33]], ptr [[TMP34]], i32 4, <16 x i1> 
[[TMP30]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP33]], ptr align 4 [[TMP34]], <16 x i1> [[TMP30]]) ; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -261,10 +261,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP3]], align 4 ; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100) ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP6]], i32 4, <8 x i1> [[TMP5]], <8 x i32> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 4 [[TMP6]], <8 x i1> [[TMP5]], <8 x i32> poison) ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX]] -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], i32 4, <8 x i1> [[TMP5]]) +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP8]], ptr addrspace(1) align 4 [[TMP9]], <8 x i1> [[TMP5]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -308,10 +308,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16 ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 24 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x i32> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x i32> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x i32> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 4 [[TMP12]], <8 x i1> [[TMP8]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 4 [[TMP14]], <8 x i1> [[TMP9]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 4 [[TMP15]], <8 x i1> [[TMP10]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 4 [[TMP16]], <8 x i1> [[TMP11]], <8 x i32> poison) ; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] ; AVX2-NEXT: [[TMP19:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], 
[[WIDE_LOAD6]] @@ -320,10 +320,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 8 ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16 ; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 24 -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP17]], ptr addrspace(1) [[TMP21]], i32 4, <8 x i1> [[TMP8]]) -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <8 x i1> [[TMP9]]) -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <8 x i1> [[TMP10]]) -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <8 x i1> [[TMP11]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP17]], ptr addrspace(1) align 4 [[TMP21]], <8 x i1> [[TMP8]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP18]], ptr addrspace(1) align 4 [[TMP23]], <8 x i1> [[TMP9]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP19]], ptr addrspace(1) align 4 [[TMP24]], <8 x i1> [[TMP10]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP20]], ptr addrspace(1) align 4 [[TMP25]], <8 x i1> [[TMP11]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -340,10 +340,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr addrspace(1) [[TMP38]], align 4 ; AVX2-NEXT: [[TMP30:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX11]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) [[TMP31]], i32 4, <8 x i1> [[TMP30]], <8 x i32> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1(ptr addrspace(1) align 4 [[TMP31]], <8 x i1> [[TMP30]], <8 x i32> poison) ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX11]] -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) [[TMP34]], i32 4, <8 x i1> [[TMP30]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP33]], ptr addrspace(1) align 4 [[TMP34]], <8 x i1> [[TMP30]]) ; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 ; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX2-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -387,10 +387,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 32 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP12]], i32 48 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x i32> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call 
<16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x i32> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x i32> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) align 4 [[TMP12]], <16 x i1> [[TMP8]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) align 4 [[TMP14]], <16 x i1> [[TMP9]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) align 4 [[TMP15]], <16 x i1> [[TMP10]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) align 4 [[TMP16]], <16 x i1> [[TMP11]], <16 x i32> poison) ; AVX512-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP18:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] ; AVX512-NEXT: [[TMP19:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] @@ -399,10 +399,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 16 ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 32 ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(1) [[TMP21]], i32 48 -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP17]], ptr addrspace(1) [[TMP21]], i32 4, <16 x i1> [[TMP8]]) -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP18]], ptr addrspace(1) [[TMP23]], i32 4, <16 x i1> [[TMP9]]) -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP19]], ptr addrspace(1) [[TMP24]], i32 4, <16 x i1> [[TMP10]]) -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP20]], ptr addrspace(1) [[TMP25]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP17]], ptr addrspace(1) align 4 [[TMP21]], <16 x i1> [[TMP8]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP18]], ptr addrspace(1) align 4 [[TMP23]], <16 x i1> [[TMP9]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP19]], ptr addrspace(1) align 4 [[TMP24]], <16 x i1> [[TMP10]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP20]], ptr addrspace(1) align 4 [[TMP25]], <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -419,10 +419,10 @@ define void @foo1_addrspace1(ptr addrspace(1) nocapture %A, ptr addrspace(1) noc ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr addrspace(1) [[TMP28]], align 4 ; AVX512-NEXT: [[TMP30:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(1) [[B]], i64 [[INDEX11]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) [[TMP31]], i32 4, <16 x i1> [[TMP30]], <16 x i32> poison) +; 
AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1(ptr addrspace(1) align 4 [[TMP31]], <16 x i1> [[TMP30]], <16 x i32> poison) ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD13]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr addrspace(1) [[A]], i64 [[INDEX11]] -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP33]], ptr addrspace(1) [[TMP34]], i32 4, <16 x i1> [[TMP30]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1(<16 x i32> [[TMP33]], ptr addrspace(1) align 4 [[TMP34]], <16 x i1> [[TMP30]]) ; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -491,11 +491,11 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 ; AVX1-NEXT: [[TMP5:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], splat (i32 100) ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP6]], i32 4, <8 x i1> [[TMP5]], <8 x float> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP6]], <8 x i1> [[TMP5]], <8 x float> poison) ; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr [[TMP10]], i32 4, <8 x i1> [[TMP5]]) +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP9]], ptr align 4 [[TMP10]], <8 x i1> [[TMP5]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -539,10 +539,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 16 ; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 24 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP12]], i32 4, <8 x i1> [[TMP8]], <8 x float> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP14]], i32 4, <8 x i1> [[TMP9]], <8 x float> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP11]], <8 x float> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP12]], <8 x i1> [[TMP8]], <8 x float> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP14]], <8 x i1> [[TMP9]], <8 x float> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP15]], <8 x i1> [[TMP10]], <8 x float> poison) +; AVX2-NEXT: 
[[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP16]], <8 x i1> [[TMP11]], <8 x float> poison) ; AVX2-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX2-NEXT: [[TMP18:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float> ; AVX2-NEXT: [[TMP19:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float> @@ -555,10 +555,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 8 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 24 -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP21]], ptr [[TMP25]], i32 4, <8 x i1> [[TMP8]]) -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP22]], ptr [[TMP27]], i32 4, <8 x i1> [[TMP9]]) -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP23]], ptr [[TMP28]], i32 4, <8 x i1> [[TMP10]]) -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP24]], ptr [[TMP29]], i32 4, <8 x i1> [[TMP11]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP21]], ptr align 4 [[TMP25]], <8 x i1> [[TMP8]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP22]], ptr align 4 [[TMP27]], <8 x i1> [[TMP9]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP23]], ptr align 4 [[TMP28]], <8 x i1> [[TMP10]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP24]], ptr align 4 [[TMP29]], <8 x i1> [[TMP11]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX2-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -575,11 +575,11 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, ptr [[TMP43]], align 4 ; AVX2-NEXT: [[TMP34:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX11]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP35]], i32 4, <8 x i1> [[TMP34]], <8 x float> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP35]], <8 x i1> [[TMP34]], <8 x float> poison) ; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> ; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]] ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX11]] -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr [[TMP39]], i32 4, <8 x i1> [[TMP34]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP38]], ptr align 4 [[TMP39]], <8 x i1> [[TMP34]]) ; AVX2-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 8 ; AVX2-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX2-NEXT: br i1 [[TMP41]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -623,10 +623,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr float, ptr [[TMP12]], i32 32 ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, ptr [[TMP12]], i32 48 -; AVX512-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP12]], i32 4, <16 x i1> [[TMP8]], <16 x float> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP14]], i32 4, <16 x i1> [[TMP9]], <16 x float> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP16]], i32 4, <16 x i1> [[TMP11]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 4 [[TMP12]], <16 x i1> [[TMP8]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 4 [[TMP14]], <16 x i1> [[TMP9]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 4 [[TMP15]], <16 x i1> [[TMP10]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 4 [[TMP16]], <16 x i1> [[TMP11]], <16 x float> poison) ; AVX512-NEXT: [[TMP17:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> ; AVX512-NEXT: [[TMP18:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float> ; AVX512-NEXT: [[TMP19:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float> @@ -639,10 +639,10 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, ptr [[TMP25]], i32 16 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, ptr [[TMP25]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP25]], i32 48 -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP21]], ptr [[TMP25]], i32 4, <16 x i1> [[TMP8]]) -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP22]], ptr [[TMP27]], i32 4, <16 x i1> [[TMP9]]) -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP23]], ptr [[TMP28]], i32 4, <16 x i1> [[TMP10]]) -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP24]], ptr [[TMP29]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP21]], ptr align 4 [[TMP25]], <16 x i1> [[TMP8]]) +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP22]], ptr align 4 [[TMP27]], <16 x i1> [[TMP9]]) +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP23]], ptr align 4 [[TMP28]], <16 x i1> [[TMP10]]) +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP24]], ptr align 4 [[TMP29]], <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -659,11 +659,11 @@ define void @foo2(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, ptr [[TMP32]], align 4 ; AVX512-NEXT: [[TMP34:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], splat (i32 100) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX11]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[TMP35]], i32 4, <16 x i1> [[TMP34]], <16 x float> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <16 x float> 
@llvm.masked.load.v16f32.p0(ptr align 4 [[TMP35]], <16 x i1> [[TMP34]], <16 x float> poison) ; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD13]], [[TMP37]] ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX11]] -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP38]], ptr [[TMP39]], i32 4, <16 x i1> [[TMP34]]) +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP38]], ptr align 4 [[TMP39]], <16 x i1> [[TMP34]]) ; AVX512-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX11]], 16 ; AVX512-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 10000 ; AVX512-NEXT: br i1 [[TMP41]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -747,10 +747,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4 ; AVX1-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8 ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP10]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP10]], <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META11:![0-9]+]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP12]], <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP13]], <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META11]] +; AVX1-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP14]], <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META11]] ; AVX1-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> ; AVX1-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> ; AVX1-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> @@ -763,10 +763,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 ; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8 ; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP23]], i32 8, <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]] -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]] -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr 
[[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]] -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr align 8 [[TMP23]], <4 x i1> [[TMP6]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr align 8 [[TMP25]], <4 x i1> [[TMP7]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr align 8 [[TMP26]], <4 x i1> [[TMP8]]), !alias.scope [[META13]], !noalias [[META15]] +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr align 8 [[TMP27]], <4 x i1> [[TMP9]]), !alias.scope [[META13]], !noalias [[META15]] ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX1-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -810,10 +810,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 4 ; AVX2-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 8 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP10]], i32 8, <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META15:![0-9]+]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP12]], i32 8, <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META15]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META15]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP14]], i32 8, <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META15]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP10]], <4 x i1> [[TMP6]], <4 x double> poison), !alias.scope [[META15:![0-9]+]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP12]], <4 x i1> [[TMP7]], <4 x double> poison), !alias.scope [[META15]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP13]], <4 x i1> [[TMP8]], <4 x double> poison), !alias.scope [[META15]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP14]], <4 x i1> [[TMP9]], <4 x double> poison), !alias.scope [[META15]] ; AVX2-NEXT: [[TMP15:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> ; AVX2-NEXT: [[TMP16:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x double> ; AVX2-NEXT: [[TMP17:%.*]] = sitofp <4 x i32> [[WIDE_LOAD7]] to <4 x double> @@ -826,10 +826,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 4 ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr [[TMP23]], i32 8, <4 x i1> [[TMP6]]), 
!alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr [[TMP25]], i32 8, <4 x i1> [[TMP7]]), !alias.scope [[META17]], !noalias [[META19]] -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr [[TMP26]], i32 8, <4 x i1> [[TMP8]]), !alias.scope [[META17]], !noalias [[META19]] -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr [[TMP27]], i32 8, <4 x i1> [[TMP9]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP19]], ptr align 8 [[TMP23]], <4 x i1> [[TMP6]]), !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP20]], ptr align 8 [[TMP25]], <4 x i1> [[TMP7]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP21]], ptr align 8 [[TMP26]], <4 x i1> [[TMP8]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP22]], ptr align 8 [[TMP27]], <4 x i1> [[TMP9]]), !alias.scope [[META17]], !noalias [[META19]] ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; AVX2-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -875,10 +875,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP10]], i32 8 ; AVX512-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i32 16 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP10]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP10]], i32 8, <8 x i1> [[TMP6]], <8 x double> poison), !alias.scope [[META15:![0-9]+]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP12]], i32 8, <8 x i1> [[TMP7]], <8 x double> poison), !alias.scope [[META15]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double> poison), !alias.scope [[META15]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP14]], i32 8, <8 x i1> [[TMP9]], <8 x double> poison), !alias.scope [[META15]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP10]], <8 x i1> [[TMP6]], <8 x double> poison), !alias.scope [[META15:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP12]], <8 x i1> [[TMP7]], <8 x double> poison), !alias.scope [[META15]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP13]], <8 x i1> [[TMP8]], <8 x double> poison), !alias.scope [[META15]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP14]], <8 x i1> [[TMP9]], <8 x double> poison), !alias.scope [[META15]] ; AVX512-NEXT: [[TMP15:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> ; AVX512-NEXT: [[TMP16:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x double> ; AVX512-NEXT: [[TMP17:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x double> @@ -891,10 +891,10 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; 
AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP23]], i32 8 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP23]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP23]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP19]], ptr [[TMP23]], i32 8, <8 x i1> [[TMP6]]), !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP20]], ptr [[TMP25]], i32 8, <8 x i1> [[TMP7]]), !alias.scope [[META17]], !noalias [[META19]] -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP21]], ptr [[TMP26]], i32 8, <8 x i1> [[TMP8]]), !alias.scope [[META17]], !noalias [[META19]] -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP22]], ptr [[TMP27]], i32 8, <8 x i1> [[TMP9]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP19]], ptr align 8 [[TMP23]], <8 x i1> [[TMP6]]), !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP20]], ptr align 8 [[TMP25]], <8 x i1> [[TMP7]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP21]], ptr align 8 [[TMP26]], <8 x i1> [[TMP8]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP22]], ptr align 8 [[TMP27]], <8 x i1> [[TMP9]]), !alias.scope [[META17]], !noalias [[META19]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 ; AVX512-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -911,11 +911,11 @@ define void @foo3(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, ptr [[TMP41]], align 4, !alias.scope [[META12]] ; AVX512-NEXT: [[TMP31:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], splat (i32 100) ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX12]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP32]], i32 8, <8 x i1> [[TMP31]], <8 x double> poison), !alias.scope [[META15]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP32]], <8 x i1> [[TMP31]], <8 x double> poison), !alias.scope [[META15]] ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> ; AVX512-NEXT: [[TMP34:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD14]], [[TMP33]] ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 [[INDEX12]] -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP34]], ptr [[TMP35]], i32 8, <8 x i1> [[TMP31]]), !alias.scope [[META17]], !noalias [[META19]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[TMP34]], ptr align 8 [[TMP35]], <8 x i1> [[TMP31]]), !alias.scope [[META17]], !noalias [[META19]] ; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 ; AVX512-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 ; AVX512-NEXT: br i1 [[TMP39]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -1011,15 +1011,15 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP0]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META24:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[TMP0]], <8 x i1> splat (i1 true), <8 x i32> poison), !alias.scope [[META24:![0-9]+]] ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100) ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], splat (i64 1) ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META27:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> align 8 [[TMP3]], <8 x i1> [[TMP1]], <8 x double> poison), !alias.scope [[META27:![0-9]+]] ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> ; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER6]], [[TMP4]] ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope [[META29:![0-9]+]], !noalias [[META31:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0(<8 x double> [[TMP5]], <8 x ptr> align 8 [[TMP6]], <8 x i1> [[TMP1]]), !alias.scope [[META29:![0-9]+]], !noalias [[META31:![0-9]+]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 128) ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 @@ -1147,16 +1147,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -12 ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -3 ; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP16]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP16]], <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope [[META25:![0-9]+]] ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP18]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope [[META25]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP18]], <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope 
[[META25]] ; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP20]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META25]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP20]], <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope [[META25]] ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP22]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META25]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP22]], <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope [[META25]] ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP23:%.*]] = fadd <4 x double> [[REVERSE13]], splat (double 5.000000e-01) ; AVX2-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[REVERSE16]], splat (double 5.000000e-01) @@ -1172,13 +1172,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -12 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -3 ; AVX2-NEXT: [[REVERSE24:%.*]] = shufflevector <4 x double> [[TMP23]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE12]]), !alias.scope [[META27:![0-9]+]], !noalias [[META29:![0-9]+]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE24]], ptr align 8 [[TMP29]], <4 x i1> [[REVERSE12]]), !alias.scope [[META27:![0-9]+]], !noalias [[META29:![0-9]+]] ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE14]]), !alias.scope [[META27]], !noalias [[META29]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE26]], ptr align 8 [[TMP31]], <4 x i1> [[REVERSE14]]), !alias.scope [[META27]], !noalias [[META29]] ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[TMP25]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <4 x i1> [[REVERSE17]]), !alias.scope [[META27]], !noalias [[META29]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE28]], ptr align 8 [[TMP33]], <4 x i1> [[REVERSE17]]), !alias.scope [[META27]], !noalias [[META29]] ; AVX2-NEXT: [[REVERSE30:%.*]] = shufflevector <4 x double> [[TMP26]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <4 x i1> 
[[REVERSE20]]), !alias.scope [[META27]], !noalias [[META29]] +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[REVERSE30]], ptr align 8 [[TMP35]], <4 x i1> [[REVERSE20]]), !alias.scope [[META27]], !noalias [[META29]] ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX2-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] @@ -1238,16 +1238,16 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP14]], i32 -24 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP21]], i32 -7 ; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP16]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META37:![0-9]+]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP16]], <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope [[META37:![0-9]+]] ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP18]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META37]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP18]], <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope [[META37]] ; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP20]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META37]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP20]], <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope [[META37]] ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP22]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META37]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr align 8 [[TMP22]], <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope [[META37]] ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; AVX512-NEXT: [[TMP23:%.*]] = fadd <8 x double> 
[[REVERSE13]], splat (double 5.000000e-01) ; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[REVERSE16]], splat (double 5.000000e-01) @@ -1263,13 +1263,13 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[TMP27]], i32 -24 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP34]], i32 -7 ; AVX512-NEXT: [[REVERSE24:%.*]] = shufflevector <8 x double> [[TMP23]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE12]]), !alias.scope [[META39:![0-9]+]], !noalias [[META41:![0-9]+]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE24]], ptr align 8 [[TMP29]], <8 x i1> [[REVERSE12]]), !alias.scope [[META39:![0-9]+]], !noalias [[META41:![0-9]+]] ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x double> [[TMP24]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE14]]), !alias.scope [[META39]], !noalias [[META41]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE26]], ptr align 8 [[TMP31]], <8 x i1> [[REVERSE14]]), !alias.scope [[META39]], !noalias [[META41]] ; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[TMP25]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr [[TMP33]], i32 8, <8 x i1> [[REVERSE17]]), !alias.scope [[META39]], !noalias [[META41]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE28]], ptr align 8 [[TMP33]], <8 x i1> [[REVERSE17]]), !alias.scope [[META39]], !noalias [[META41]] ; AVX512-NEXT: [[REVERSE30:%.*]] = shufflevector <8 x double> [[TMP26]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr [[TMP35]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope [[META39]], !noalias [[META41]] +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[REVERSE30]], ptr align 8 [[TMP35]], <8 x i1> [[REVERSE20]]), !alias.scope [[META39]], !noalias [[META41]] ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX512-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] @@ -1351,10 +1351,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 4 ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 8 ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> 
[[TMP17]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP13]], <4 x i1> [[TMP14]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP20]], <4 x i1> [[TMP15]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP21]], <4 x i1> [[TMP16]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP22]], <4 x i1> [[TMP17]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX1-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX1-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer @@ -1367,10 +1367,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4 ; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8 ; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP35]], <4 x i1> [[TMP31]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP37]], <4 x i1> [[TMP32]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP38]], <4 x i1> [[TMP33]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -1392,11 +1392,11 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) ; AVX1-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer ; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP47]], <4 x i1> [[TMP46]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer ; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]] -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]]) +; AVX1-NEXT: call void 
@llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]]) ; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] ; AVX1-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -1443,10 +1443,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4 ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8 ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP18]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP18]], <4 x i1> [[TMP14]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP20]], <4 x i1> [[TMP15]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP21]], <4 x i1> [[TMP16]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP22]], <4 x i1> [[TMP17]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX2-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX2-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer @@ -1459,10 +1459,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8 ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP35]], <4 x i1> [[TMP31]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP37]], <4 x i1> [[TMP32]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP38]], <4 x i1> [[TMP33]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 16 ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] @@ -1484,11 +1484,11 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) ; AVX2-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP47]], <4 x i1> [[TMP46]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer ; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]] -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]]) ; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] ; AVX2-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] @@ -1535,10 +1535,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8 ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP18]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP18]], <8 x i1> [[TMP14]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP20]], <8 x i1> [[TMP15]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP21]], <8 x i1> [[TMP16]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP22]], <8 x i1> [[TMP17]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP28:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX512-NEXT: [[TMP29:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer @@ -1551,10 +1551,10 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8 ; AVX512-NEXT: 
[[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP35]], <8 x i1> [[TMP31]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP37]], <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP38]], <8 x i1> [[TMP33]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <8 x i1> [[TMP34]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] @@ -1576,11 +1576,11 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1) ; AVX512-NEXT: [[TMP46:%.*]] = icmp ne <8 x i8> [[TMP44]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP47]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP47]], <8 x i1> [[TMP46]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP50:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer ; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]] -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <8 x i1> [[TMP51]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <8 x i1> [[TMP51]]) ; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8 ; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] ; AVX512-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] @@ -1672,10 +1672,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4 ; AVX1-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8 ; AVX1-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 12 -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP18]], i32 8, <4 x i1> [[TMP14]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) -; AVX1-NEXT: 
[[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) -; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP18]], <4 x i1> [[TMP14]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP20]], <4 x i1> [[TMP15]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP21]], <4 x i1> [[TMP16]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP22]], <4 x i1> [[TMP17]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX1-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX1-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer @@ -1688,10 +1688,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4 ; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8 ; AVX1-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12 -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]]) -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP35]], <4 x i1> [[TMP31]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP37]], <4 x i1> [[TMP32]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP38]], <4 x i1> [[TMP33]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -1713,11 +1713,11 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) ; AVX1-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer ; AVX1-NEXT: [[TMP45:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]] -; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP45]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP45]], <4 x i1> [[TMP46]], <4 x ptr> poison) ; AVX1-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer ; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> 
zeroinitializer ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]] -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]]) +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]]) ; AVX1-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; AVX1-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] ; AVX1-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] @@ -1764,10 +1764,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 4 ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 8 ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr ptr, ptr [[TMP13]], i32 12 -; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP13]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP19]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) -; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP23]], i32 8, <4 x i1> [[TMP12]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP13]], <4 x i1> [[TMP17]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP19]], <4 x i1> [[TMP15]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP20]], <4 x i1> [[TMP16]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP23]], <4 x i1> [[TMP12]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP18:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX2-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX2-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer @@ -1780,10 +1780,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 4 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 8 ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 12 -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <4 x i1> [[TMP31]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <4 x i1> [[TMP32]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <4 x i1> [[TMP33]]) -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <4 x i1> [[TMP34]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP35]], <4 x i1> [[TMP31]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP37]], <4 x i1> [[TMP32]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x 
double> splat (double 5.000000e-01), ptr align 8 [[TMP38]], <4 x i1> [[TMP33]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <4 x i1> [[TMP34]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] @@ -1805,11 +1805,11 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) ; AVX2-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]] -; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP47]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) +; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr align 8 [[TMP47]], <4 x i1> [[TMP46]], <4 x ptr> poison) ; AVX2-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer ; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]] -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <4 x i1> [[TMP51]]) +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <4 x i1> [[TMP51]]) ; AVX2-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] ; AVX2-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] @@ -1856,10 +1856,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8 ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 16 ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 24 -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP18]], i32 8, <8 x i1> [[TMP14]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison) -; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP18]], <8 x i1> [[TMP14]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP20]], <8 x i1> [[TMP15]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP21]], <8 x i1> [[TMP16]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP22]], <8 x i1> [[TMP17]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP28:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX512-NEXT: 
[[TMP29:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer @@ -1872,10 +1872,10 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[TMP35]], i32 8 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[TMP35]], i32 16 ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[TMP35]], i32 24 -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP35]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP37]], i32 8, <8 x i1> [[TMP32]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP38]], i32 8, <8 x i1> [[TMP33]]) -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP39]], i32 8, <8 x i1> [[TMP34]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP35]], <8 x i1> [[TMP31]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP37]], <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP38]], <8 x i1> [[TMP33]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP39]], <8 x i1> [[TMP34]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[TMP40]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]] @@ -1897,11 +1897,11 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1) ; AVX512-NEXT: [[TMP46:%.*]] = icmp ne <8 x i8> [[TMP44]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[INDEX10]] -; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP47]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr align 8 [[TMP47]], <8 x i1> [[TMP46]], <8 x ptr> poison) ; AVX512-NEXT: [[TMP50:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer ; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[INDEX10]] -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr [[TMP52]], i32 8, <8 x i1> [[TMP51]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> splat (double 5.000000e-01), ptr align 8 [[TMP52]], <8 x i1> [[TMP51]]) ; AVX512-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 8 ; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC9]] ; AVX512-NEXT: br i1 [[TMP54]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll index e1140b5..5e1850b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -23,10 +23,10 @@ define i32 @foo_optsize() #0 { ; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> 
[[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], splat (i32 202) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[TMP2]], <64 x i1> [[TMP1]], <64 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> splat (i8 2), <64 x i8> splat (i8 1) -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr align 1 [[TMP2]], <64 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -48,10 +48,10 @@ define i32 @foo_optsize() #0 { ; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], splat (i32 202) ; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]] -; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) +; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 [[TMP2]], <32 x i1> [[TMP1]], <32 x i8> poison) ; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 2), <32 x i8> splat (i8 1) -; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]]) +; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr align 1 [[TMP2]], <32 x i1> [[TMP1]]) ; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 ; AUTOVF-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -95,10 +95,10 @@ define i32 @foo_minsize() #1 { ; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, 
i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], splat (i32 202) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]], <64 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0(ptr align 1 [[TMP2]], <64 x i1> [[TMP1]], <64 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> splat (i8 2), <64 x i8> splat (i8 1) -; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <64 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0(<64 x i8> [[TMP5]], ptr align 1 [[TMP2]], <64 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -120,10 +120,10 @@ define i32 @foo_minsize() #1 { ; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], splat (i32 202) ; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[INDEX]] -; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]], <32 x i8> poison) +; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr align 1 [[TMP2]], <32 x i1> [[TMP1]], <32 x i8> poison) ; AUTOVF-NEXT: [[TMP4:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer ; AUTOVF-NEXT: [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> splat (i8 2), <32 x i8> splat (i8 1) -; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr [[TMP2]], i32 1, <32 x i1> [[TMP1]]) +; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0(<32 x i8> [[TMP5]], ptr align 1 [[TMP2]], <32 x i1> [[TMP1]]) ; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 ; AUTOVF-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -169,7 +169,7 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <64 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <64 x i32> [[VEC_IND]], 
[[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <64 x i32> [[TMP1]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0(<64 x ptr> [[TMP2]], i32 4, <64 x i1> splat (i1 true), <64 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0(<64 x ptr> align 4 [[TMP2]], <64 x i1> splat (i1 true), <64 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 @@ -194,7 +194,7 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; AUTOVF-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; AUTOVF-NEXT: [[TMP1:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <8 x i32> [[TMP1]] -; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) +; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[TMP2]], <8 x i1> splat (i1 true), <8 x i32> poison) ; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], ptr [[TMP3]], align 4 ; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -352,7 +352,7 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> ; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <64 x i32> [[VEC_IV]], [[BROADCAST_SPLAT4]] -; CHECK-NEXT: call void @llvm.masked.scatter.v64p0.v64p0(<64 x ptr> zeroinitializer, <64 x ptr> [[TMP5]], i32 8, <64 x i1> [[TMP6]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v64p0.v64p0(<64 x ptr> zeroinitializer, <64 x ptr> align 8 [[TMP5]], <64 x i1> [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 -4608 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -388,7 +388,7 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; AUTOVF-NEXT: [[TMP6:%.*]] = icmp ule <8 x i32> [[VEC_IV]], [[BROADCAST_SPLAT4]] -; AUTOVF-NEXT: call void @llvm.masked.scatter.v8p0.v8p0(<8 x ptr> zeroinitializer, <8 x ptr> [[TMP5]], i32 8, <8 x i1> [[TMP6]]) +; 
AUTOVF-NEXT: call void @llvm.masked.scatter.v8p0.v8p0(<8 x ptr> zeroinitializer, <8 x ptr> align 8 [[TMP5]], <8 x i1> [[TMP6]]) ; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; AUTOVF-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 -576 ; AUTOVF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll index 16b31ae..113bb7a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -37,13 +37,13 @@ define void @foo(i32 %n) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label %[[FOR_BODY31:.*]] ; CHECK: [[FOR_BODY31]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 8) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 @@ -66,13 +66,13 @@ define void @foo(i32 %n) { ; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer ; AVX-NEXT: br label %[[VECTOR_BODY:.*]] ; AVX: [[VECTOR_BODY]]: -; AVX-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x ptr> getelementptr inbounds ([8 x i32], ptr @arr2, <8 x i64> zeroinitializer, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>), i32 4, <8 x i1> splat (i1 true)) +; AVX-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x ptr> align 4 getelementptr inbounds ([8 x i32], ptr @arr2, <8 x i64> zeroinitializer, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>), <8 x i1> splat (i1 true)) ; AVX-NEXT: [[TMP2:%.*]] = add nsw <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[BROADCAST_SPLAT]] ; AVX-NEXT: br label %[[FOR_BODY31:.*]] ; AVX: [[FOR_BODY31]]: ; AVX-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ] ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <8 x i64> [[VEC_PHI]], <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; 
AVX-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP2]], <8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true)) +; AVX-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP2]], <8 x ptr> align 4 [[TMP3]], <8 x i1> splat (i1 true)) ; AVX-NEXT: [[TMP4]] = add nuw nsw <8 x i64> [[VEC_PHI]], splat (i64 1) ; AVX-NEXT: [[TMP5:%.*]] = icmp eq <8 x i64> [[TMP4]], splat (i64 8) ; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll index 5d76dfb..66809eb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -73,6 +73,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4 @@ -92,14 +96,10 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2 -; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3 -; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]] +; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP17]], align 4, !llvm.access.group [[ACC_GRP1:![0-9]+]] +; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP18]], align 4, !llvm.access.group [[ACC_GRP1]] +; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP19]], align 4, !llvm.access.group [[ACC_GRP1]] +; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP20]], align 4, !llvm.access.group [[ACC_GRP1]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP27]], i64 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !llvm.access.group [[ACC_GRP0]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll index b6acf38..485e3c4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll @@ -31,7 +31,7 @@ define ptr @foo(ptr %p, ptr %p.last) unnamed_addr #0 { ; CHECK-NEXT: [[STEP_ADD:%.*]] = getelementptr i8, <4 x ptr> [[TMP5]], <4 x i64> splat (i64 4096) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = getelementptr i8, <4 x ptr> [[STEP_ADD]], <4 x i64> splat (i64 4096) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <4 x ptr> [[STEP_ADD_2]], <4 x i64> splat (i64 4096) -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP8]], i32 8, <4 x i1> splat (i1 true), <4 x ptr> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> align 8 [[TMP8]], <4 x i1> splat (i1 true), <4 x ptr> poison) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16384 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -91,7 +91,7 @@ define ptr @bar(ptr %p, ptr %p.last) unnamed_addr #0 { ; CHECK-NEXT: [[STEP_ADD:%.*]] = getelementptr i8, <4 x ptr> [[TMP5]], <4 x i64> splat (i64 4096) ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = getelementptr i8, <4 x ptr> [[STEP_ADD]], <4 x i64> splat (i64 4096) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <4 x ptr> [[STEP_ADD_2]], <4 x i64> splat (i64 4096) -; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP8]], i32 8, <4 x i1> splat (i1 true), <4 x ptr> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> align 8 [[TMP8]], <4 x i1> splat (i1 true), <4 x ptr> poison) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16384 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 286da4d..785a248 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -44,18 +44,18 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 0 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP18]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP19]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP20]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> [[TMP21]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], 
<4 x ptr addrspace(13)> align 8 [[TMP18]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> align 8 [[TMP19]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> align 8 [[TMP20]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT]], <4 x ptr addrspace(13)> align 8 [[TMP21]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND]], i32 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD]], i32 1 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD4]], i32 1 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[STEP_ADD5]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP22]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP23]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP24]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> [[TMP25]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> align 8 [[TMP22]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> align 8 [[TMP23]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> align 8 [[TMP24]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT8]], <4 x ptr addrspace(13)> align 8 [[TMP25]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], splat (i64 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -82,9 +82,9 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 0 -; CHECK-NEXT: 
call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT11]], <4 x ptr addrspace(13)> [[TMP28]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT11]], <4 x ptr addrspace(13)> align 8 [[TMP28]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT13]], <4 x ptr addrspace(13)> [[TMP29]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p13(<4 x i64> [[BROADCAST_SPLAT13]], <4 x ptr addrspace(13)> align 8 [[TMP29]], <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND8]], splat (i64 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll index 3616379..ba7db65 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -33,7 +33,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 -3 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 8 [[TMP8]], <4 x i1> [[REVERSE]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll index e25be6f..2aceb27 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll @@ -28,7 +28,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 -12) ; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], splat (i64 13) ; COST-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]] -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP10]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP10]]) ; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; COST-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; COST-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -85,8 +85,8 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], splat (i64 13) ; FORCED-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] ; FORCED-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] -; 
FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP15]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP16]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -165,7 +165,7 @@ define void @switch_default_to_latch_common_dest_using_branches(ptr %start, ptr ; COST-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true) ; COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer ; COST-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP7]] -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP11]]) ; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; COST-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; COST-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -227,8 +227,8 @@ define void @switch_default_to_latch_common_dest_using_branches(ptr %start, ptr ; FORCED-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP9]] ; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP10]] -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP17]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP18]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP17]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP18]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -352,14 +352,14 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP14]] ; FORCED-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], splat (i1 true) ; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP18]], splat (i1 true) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP13]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP8]], i32 1, <4 x i1> [[TMP14]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP9]]) -; FORCED-NEXT: call void 
@llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP19]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP13]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[TMP8]], <4 x i1> [[TMP14]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP11]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP8]], <4 x i1> [[TMP12]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP9]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP10]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP19]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[TMP8]], <4 x i1> [[TMP20]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -463,10 +463,10 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e ; COST-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) ; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer ; COST-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP13]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP13]]) ; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP14]]) -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP7]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP7]]) ; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; COST-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; COST-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -544,14 +544,14 @@ define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %e ; FORCED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP19]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP20]], <4 x i1> zeroinitializer -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP21]]) -; 
FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr [[TMP8]], i32 1, <4 x i1> [[TMP22]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP21]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 1), ptr align 1 [[TMP8]], <4 x i1> [[TMP22]]) ; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP23]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP24]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP9]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP23]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP8]], <4 x i1> [[TMP24]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP9]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP10]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -710,12 +710,12 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; FORCED-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP28]], [[TMP36]] ; FORCED-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP37]], splat (i1 true) ; FORCED-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP38]], splat (i1 true) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP35]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP36]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP27]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP28]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP39]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP40]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP35]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP8]], <4 x i1> [[TMP36]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP27]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP28]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP39]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[TMP8]], 
<4 x i1> [[TMP40]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -847,12 +847,12 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP12]] ; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP17]], splat (i1 true) ; FORCED-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP18]], splat (i1 true) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP20]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP21]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP11]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP8]], <4 x i1> [[TMP12]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP15]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP16]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP20]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[TMP8]], <4 x i1> [[TMP21]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -952,9 +952,9 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; COST-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]] ; COST-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true) ; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP7]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP11]]) -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP10]]) -; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP14]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP11]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP10]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP14]]) ; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; COST-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; COST-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -1033,12 +1033,12 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP20]], splat (i1 true) ; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP22]], <4 x i1> zeroinitializer -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP25]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP26]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP15]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP23]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP24]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP25]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP8]], <4 x i1> [[TMP26]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP15]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP16]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP23]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[TMP8]], <4 x i1> [[TMP24]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -1186,16 +1186,16 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED-NEXT: [[TMP28:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP20]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP25]] ; FORCED-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP28]], [[TMP26]] -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP29]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP30]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP29]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr align 1 [[TMP8]], <4 x i1> [[TMP30]]) ; FORCED-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP17]], <4 x i1> zeroinitializer ; FORCED-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP32]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP33]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> 
[[TMP32]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP33]]) ; FORCED-NEXT: [[TMP36:%.*]] = or <4 x i1> [[TMP32]], [[TMP15]] ; FORCED-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP33]], [[TMP16]] -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP36]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr [[TMP8]], i32 1, <4 x i1> [[TMP37]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP36]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 2), ptr align 1 [[TMP8]], <4 x i1> [[TMP37]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1356,8 +1356,8 @@ define void @large_number_of_cases(ptr %start, ptr %end) { ; FORCED-NEXT: [[TMP40:%.*]] = or <4 x i1> [[TMP38]], [[TMP24]] ; FORCED-NEXT: [[TMP57:%.*]] = or <4 x i1> [[TMP39]], [[TMP25]] ; FORCED-NEXT: [[TMP58:%.*]] = or <4 x i1> [[TMP40]], [[TMP26]] -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[NEXT_GEP]], i32 1, <4 x i1> [[TMP57]]) -; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr [[TMP8]], i32 1, <4 x i1> [[TMP58]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[NEXT_GEP]], <4 x i1> [[TMP57]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> splat (i64 42), ptr align 1 [[TMP8]], <4 x i1> [[TMP58]]) ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FORCED-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; FORCED-NEXT: br i1 [[TMP59]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll index df1c4f9..5321d69 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll @@ -21,10 +21,10 @@ while.cond63.preheader.while.end76_crit_edge: ret void while.body: - %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ] - %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ] - %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ] - %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ] + %d2_fx.015 = phi double [ %sub52, %while.body ], [ 0.0e+00, %entry ] + %d2_fy.014 = phi double [ %sub58, %while.body ], [ 0.0e+00, %entry ] + %d3_fy.013 = phi double [ %div56, %while.body ], [ 0.0e+00, %entry ] + %d3_fx.012 = phi double [ %div50, %while.body ], [ 0.0e+00, %entry ] %div50 = fmul double %d3_fx.012, 1.250000e-01 %sub52 = fsub double 0.000000e+00, %div50 %div56 = fmul double %d3_fy.013, 1.250000e-01 diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs-max-bandwidth.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs-max-bandwidth.ll new file mode 100644 index 0000000..5011852 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs-max-bandwidth.ll @@ -0,0 +1,378 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -mtriple=x86_64-apple-macosx 
-vectorizer-maximize-bandwidth -S %s | FileCheck %s + +declare void @init(ptr captures(none), ptr captures(none), ptr captures(none)) + +define void @replicating_store_with_phi_addr1(ptr noalias %array, i64 %N, i32 %x, i1 %cond) { +; CHECK-LABEL: define void @replicating_store_with_phi_addr1( +; CHECK-SAME: ptr noalias [[ARRAY:%.*]], i64 [[N:%.*]], i32 [[X:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR3:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @init(ptr [[PTR1]], ptr [[PTR2]], ptr [[PTR3]]) +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x ptr> poison, ptr [[PTR1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x ptr> [[BROADCAST_SPLATINSERT]], <16 x ptr> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i1> poison, i1 [[COND]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT1]], <16 x i1> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE32:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> poison, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP5]], i32 4 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP6]], i32 5 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP7]], i32 6 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP9]], i32 8 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP10]], i32 9 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP11]], i32 10 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP12]], i32 11 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP13]], 
i32 12 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP14]], i32 13 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP15]], i32 14 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP16]], i32 15 +; CHECK-NEXT: [[TMP33:%.*]] = icmp sgt <16 x i32> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = icmp slt i32 [[X]], [[TMP1]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp slt i32 [[X]], [[TMP2]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp slt i32 [[X]], [[TMP3]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp slt i32 [[X]], [[TMP4]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp slt i32 [[X]], [[TMP5]] +; CHECK-NEXT: [[TMP40:%.*]] = icmp slt i32 [[X]], [[TMP6]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp slt i32 [[X]], [[TMP7]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp slt i32 [[X]], [[TMP8]] +; CHECK-NEXT: [[TMP43:%.*]] = icmp slt i32 [[X]], [[TMP9]] +; CHECK-NEXT: [[TMP44:%.*]] = icmp slt i32 [[X]], [[TMP10]] +; CHECK-NEXT: [[TMP45:%.*]] = icmp slt i32 [[X]], [[TMP11]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp slt i32 [[X]], [[TMP12]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp slt i32 [[X]], [[TMP13]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp slt i32 [[X]], [[TMP14]] +; CHECK-NEXT: [[TMP49:%.*]] = icmp slt i32 [[X]], [[TMP15]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp slt i32 [[X]], [[TMP16]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP35]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP52:%.*]] = select i1 [[TMP36]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP37]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP54:%.*]] = select i1 [[TMP38]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP55:%.*]] = select i1 [[TMP39]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP40]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP57:%.*]] = select i1 [[TMP41]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP42]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP43]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP44]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP61:%.*]] = select i1 [[TMP45]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP46]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP47]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP64:%.*]] = select i1 [[TMP48]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP49]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP66:%.*]] = select i1 [[TMP50]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x ptr> poison, ptr [[TMP51]], i32 0 +; CHECK-NEXT: [[TMP68:%.*]] = insertelement <16 x ptr> [[TMP67]], ptr [[TMP52]], i32 1 +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <16 x ptr> [[TMP68]], ptr [[TMP53]], i32 2 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x ptr> [[TMP69]], ptr [[TMP54]], i32 3 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <16 x ptr> [[TMP70]], ptr [[TMP55]], i32 4 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x ptr> [[TMP71]], ptr [[TMP56]], i32 5 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <16 x ptr> [[TMP72]], ptr [[TMP57]], i32 6 +; CHECK-NEXT: [[TMP74:%.*]] = insertelement <16 x ptr> [[TMP73]], ptr [[TMP58]], i32 7 +; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x ptr> [[TMP74]], ptr [[TMP59]], i32 8 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <16 x ptr> [[TMP75]], ptr [[TMP60]], i32 9 +; CHECK-NEXT: 
[[TMP77:%.*]] = insertelement <16 x ptr> [[TMP76]], ptr [[TMP61]], i32 10 +; CHECK-NEXT: [[TMP78:%.*]] = insertelement <16 x ptr> [[TMP77]], ptr [[TMP62]], i32 11 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <16 x ptr> [[TMP78]], ptr [[TMP63]], i32 12 +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <16 x ptr> [[TMP79]], ptr [[TMP64]], i32 13 +; CHECK-NEXT: [[TMP81:%.*]] = insertelement <16 x ptr> [[TMP80]], ptr [[TMP65]], i32 14 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <16 x ptr> [[TMP81]], ptr [[TMP66]], i32 15 +; CHECK-NEXT: [[TMP83:%.*]] = select <16 x i1> [[TMP33]], <16 x i1> [[BROADCAST_SPLAT2]], <16 x i1> zeroinitializer +; CHECK-NEXT: [[TMP84:%.*]] = or <16 x i1> [[TMP83]], [[TMP34]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP33]], <16 x ptr> [[BROADCAST_SPLAT]], <16 x ptr> [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <16 x i1> [[TMP84]], i32 0 +; CHECK-NEXT: br i1 [[TMP85]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 0 +; CHECK-NEXT: store i8 0, ptr [[TMP86]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP84]], i32 1 +; CHECK-NEXT: br i1 [[TMP87]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] +; CHECK: [[PRED_STORE_IF3]]: +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 1 +; CHECK-NEXT: store i8 0, ptr [[TMP88]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; CHECK: [[PRED_STORE_CONTINUE4]]: +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <16 x i1> [[TMP84]], i32 2 +; CHECK-NEXT: br i1 [[TMP89]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] +; CHECK: [[PRED_STORE_IF5]]: +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 2 +; CHECK-NEXT: store i8 0, ptr [[TMP90]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; CHECK: [[PRED_STORE_CONTINUE6]]: +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <16 x i1> [[TMP84]], i32 3 +; CHECK-NEXT: br i1 [[TMP91]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; CHECK: [[PRED_STORE_IF7]]: +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 3 +; CHECK-NEXT: store i8 0, ptr [[TMP92]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; CHECK: [[PRED_STORE_CONTINUE8]]: +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <16 x i1> [[TMP84]], i32 4 +; CHECK-NEXT: br i1 [[TMP93]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 4 +; CHECK-NEXT: store i8 0, ptr [[TMP94]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: [[TMP95:%.*]] = extractelement <16 x i1> [[TMP84]], i32 5 +; CHECK-NEXT: br i1 [[TMP95]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; CHECK: [[PRED_STORE_IF11]]: +; CHECK-NEXT: [[TMP96:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 5 +; CHECK-NEXT: store i8 0, ptr [[TMP96]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_CONTINUE12]]: +; CHECK-NEXT: [[TMP97:%.*]] = extractelement <16 x i1> [[TMP84]], i32 6 +; CHECK-NEXT: br i1 [[TMP97]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; CHECK: [[PRED_STORE_IF13]]: +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x ptr> 
[[PREDPHI]], i32 6 +; CHECK-NEXT: store i8 0, ptr [[TMP98]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; CHECK: [[PRED_STORE_CONTINUE14]]: +; CHECK-NEXT: [[TMP99:%.*]] = extractelement <16 x i1> [[TMP84]], i32 7 +; CHECK-NEXT: br i1 [[TMP99]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; CHECK: [[PRED_STORE_IF15]]: +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 7 +; CHECK-NEXT: store i8 0, ptr [[TMP100]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; CHECK: [[PRED_STORE_CONTINUE16]]: +; CHECK-NEXT: [[TMP101:%.*]] = extractelement <16 x i1> [[TMP84]], i32 8 +; CHECK-NEXT: br i1 [[TMP101]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; CHECK: [[PRED_STORE_IF17]]: +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 8 +; CHECK-NEXT: store i8 0, ptr [[TMP102]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; CHECK: [[PRED_STORE_CONTINUE18]]: +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP84]], i32 9 +; CHECK-NEXT: br i1 [[TMP103]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; CHECK: [[PRED_STORE_IF19]]: +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 9 +; CHECK-NEXT: store i8 0, ptr [[TMP104]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; CHECK: [[PRED_STORE_CONTINUE20]]: +; CHECK-NEXT: [[TMP105:%.*]] = extractelement <16 x i1> [[TMP84]], i32 10 +; CHECK-NEXT: br i1 [[TMP105]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP106:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 10 +; CHECK-NEXT: store i8 0, ptr [[TMP106]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <16 x i1> [[TMP84]], i32 11 +; CHECK-NEXT: br i1 [[TMP107]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; CHECK: [[PRED_STORE_IF23]]: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 11 +; CHECK-NEXT: store i8 0, ptr [[TMP108]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; CHECK: [[PRED_STORE_CONTINUE24]]: +; CHECK-NEXT: [[TMP109:%.*]] = extractelement <16 x i1> [[TMP84]], i32 12 +; CHECK-NEXT: br i1 [[TMP109]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; CHECK: [[PRED_STORE_IF25]]: +; CHECK-NEXT: [[TMP110:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 12 +; CHECK-NEXT: store i8 0, ptr [[TMP110]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_CONTINUE26]]: +; CHECK-NEXT: [[TMP111:%.*]] = extractelement <16 x i1> [[TMP84]], i32 13 +; CHECK-NEXT: br i1 [[TMP111]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; CHECK: [[PRED_STORE_IF27]]: +; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 13 +; CHECK-NEXT: store i8 0, ptr [[TMP112]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; CHECK: [[PRED_STORE_CONTINUE28]]: +; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP84]], i32 14 +; CHECK-NEXT: br i1 [[TMP113]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] +; CHECK: [[PRED_STORE_IF29]]: +; CHECK-NEXT: [[TMP114:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 14 +; CHECK-NEXT: store i8 0, ptr [[TMP114]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] +; CHECK: [[PRED_STORE_CONTINUE30]]: +; CHECK-NEXT: 
[[TMP115:%.*]] = extractelement <16 x i1> [[TMP84]], i32 15 +; CHECK-NEXT: br i1 [[TMP115]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32]] +; CHECK: [[PRED_STORE_IF31]]: +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x ptr> [[PREDPHI]], i32 15 +; CHECK-NEXT: store i8 0, ptr [[TMP116]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE32]] +; CHECK: [[PRED_STORE_CONTINUE32]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP117:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 +; CHECK-NEXT: br i1 [[TMP117]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 96, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV]], 99 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_BODY:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: [[ARRAY_PTR:%.*]] = load ptr, ptr [[ARRAY]], align 8 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY_PTR]], align 4 +; CHECK-NEXT: [[CMP_POS:%.*]] = icmp sgt i32 [[VAL]], 0 +; CHECK-NEXT: br i1 [[CMP_POS]], label %[[ELSE:.*]], label %[[THEN:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[CMP_X:%.*]] = icmp slt i32 [[X]], [[VAL]] +; CHECK-NEXT: [[SELECT_PTR:%.*]] = select i1 [[CMP_X]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: br label %[[MERGE_AND_STORE:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br i1 [[COND]], label %[[MERGE_AND_STORE]], label %[[LOOP_LATCH]] +; CHECK: [[MERGE_AND_STORE]]: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi ptr [ [[SELECT_PTR]], %[[THEN]] ], [ [[PTR1]], %[[ELSE]] ] +; CHECK-NEXT: store i8 0, ptr [[PTR_PHI]], align 1 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: br label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %ptr1 = alloca i8, align 1 + %ptr2 = alloca i8, align 1 + %ptr3 = alloca i8, align 1 + call void @init(ptr %ptr1, ptr %ptr2, ptr %ptr3) + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %ec = icmp ne i64 %iv, 99 + br i1 %ec, label %loop.body, label %exit + +loop.body: + %array.ptr = load ptr, ptr %array, align 8 + %val = load i32, ptr %array.ptr, align 4 + %cmp.pos = icmp sgt i32 %val, 0 + br i1 %cmp.pos, label %else, label %then + +then: + %cmp.x = icmp slt i32 %x, %val + %select.ptr = select i1 %cmp.x, ptr %ptr2, ptr %ptr3 + br label %merge.and.store + +else: + br i1 %cond, label %merge.and.store, label %loop.latch + +merge.and.store: + %ptr.phi = phi ptr [ %select.ptr, %then ], [ %ptr1, %else ] + store i8 0, ptr %ptr.phi, align 1 + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + br label %loop.header + +exit: + ret void +} + +define void @replicating_store_with_phi_addr2(ptr noalias %array, ptr noalias %base, i64 %N, i32 %x, i1 %cond) { +; CHECK-LABEL: define void @replicating_store_with_phi_addr2( +; CHECK-SAME: ptr noalias [[ARRAY:%.*]], ptr noalias [[BASE:%.*]], i64 [[N:%.*]], i32 [[X:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[PTR1:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR2:%.*]] = alloca i8, align 1 +; CHECK-NEXT: [[PTR3:%.*]] = alloca i8, align 1 +; CHECK-NEXT: call void @init(ptr [[PTR1]], ptr [[PTR2]], ptr [[PTR3]]) +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: 
[[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV]], 99 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_BODY:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_BODY]]: +; CHECK-NEXT: [[GEP_ARRAY:%.*]] = getelementptr i32, ptr [[ARRAY]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_ARRAY]], align 8 +; CHECK-NEXT: [[L_PTR:%.*]] = load ptr, ptr [[BASE]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[L_PTR]], align 4 +; CHECK-NEXT: [[CMP_POS:%.*]] = icmp sgt i32 [[VAL]], 0 +; CHECK-NEXT: br i1 [[CMP_POS]], label %[[ELSE:.*]], label %[[THEN:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp slt i32 [[X]], [[L]] +; CHECK-NEXT: [[SELECT_PTR:%.*]] = select i1 [[SEL_CMP]], ptr [[PTR2]], ptr [[PTR3]] +; CHECK-NEXT: br label %[[MERGE_AND_STORE:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br i1 [[COND]], label %[[MERGE_AND_STORE]], label %[[LOOP_LATCH]] +; CHECK: [[MERGE_AND_STORE]]: +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi ptr [ [[SELECT_PTR]], %[[THEN]] ], [ [[PTR1]], %[[ELSE]] ] +; CHECK-NEXT: store i8 0, ptr [[PTR_PHI]], align 1 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: br label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %ptr1 = alloca i8, align 1 + %ptr2 = alloca i8, align 1 + %ptr3 = alloca i8, align 1 + call void @init(ptr %ptr1, ptr %ptr2, ptr %ptr3) + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %ec = icmp ne i64 %iv, 99 + br i1 %ec, label %loop.body, label %exit + +loop.body: + %gep.array = getelementptr i32, ptr %array, i64 %iv + %l = load i32, ptr %gep.array, align 8 + %l.ptr = load ptr, ptr %base, align 4 + %val = load i32, ptr %l.ptr, align 4 + %cmp.pos = icmp sgt i32 %val, 0 + br i1 %cmp.pos, label %else, label %then + +then: + %sel.cmp = icmp slt i32 %x, %l + %select.ptr = select i1 %sel.cmp, ptr %ptr2, ptr %ptr3 + br label %merge.and.store + +else: + br i1 %cond, label %merge.and.store, label %loop.latch + +merge.and.store: + %ptr.phi = phi ptr [ %select.ptr, %then ], [ %ptr1, %else ] + store i8 0, ptr %ptr.phi, align 1 + br label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + br label %loop.header + +exit: + ret void +} + +attributes #0 = { "target-cpu"="znver2" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll index 65058bd..14a8317 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -45,9 +45,25 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 14 ; I64-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 15 ; I64-NEXT: [[TMP20:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 +; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 +; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 +; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3 ; I64-NEXT: [[TMP21:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 +; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 +; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2 +; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3 ; I64-NEXT: [[TMP22:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 +; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 +; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 +; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 ; I64-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 +; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 +; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 +; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 ; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]] ; I64-NEXT: [[TMP25:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] ; I64-NEXT: [[TMP26:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] @@ -80,37 +96,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP37]], align 4 ; I64-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4 ; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 -; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 ; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4 -; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 ; I64-NEXT: store double [[TMP57]], ptr [[TMP41]], align 4 -; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 ; I64-NEXT: store double [[TMP58]], ptr [[TMP42]], align 4 -; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3 ; I64-NEXT: store double [[TMP59]], ptr [[TMP43]], align 4 -; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 ; I64-NEXT: store double [[TMP60]], ptr [[TMP44]], align 4 -; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 ; I64-NEXT: store double [[TMP61]], ptr [[TMP45]], align 4 -; I64-NEXT: [[TMP62:%.*]] = extractelement <4 x double> [[TMP21]], i32 2 ; I64-NEXT: store double [[TMP62]], ptr [[TMP46]], align 4 -; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> 
[[TMP21]], i32 3 ; I64-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 -; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 ; I64-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 -; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 ; I64-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 -; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 ; I64-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 -; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 ; I64-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 -; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 ; I64-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 -; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 ; I64-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 -; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 ; I64-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 -; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 ; I64-NEXT: store double [[TMP71]], ptr [[TMP55]], align 4 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) @@ -139,21 +139,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 2 ; I64-NEXT: [[TMP78:%.*]] = add i32 [[INDEX4]], 3 ; I64-NEXT: [[TMP79:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> -; I64-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] -; I64-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] -; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] -; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] -; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 -; I64-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 -; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 -; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 ; I64-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP79]], i32 0 -; I64-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 ; I64-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP79]], i32 1 -; I64-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 ; I64-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP79]], i32 2 -; I64-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 ; I64-NEXT: [[TMP91:%.*]] = extractelement <4 x double> [[TMP79]], i32 3 +; I64-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I64-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I64-NEXT: [[TMP86:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I64-NEXT: [[TMP93:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] +; I64-NEXT: [[TMP94:%.*]] = load ptr, ptr [[TMP84]], align 4 +; I64-NEXT: [[TMP95:%.*]] = load ptr, ptr [[TMP85]], align 4 +; I64-NEXT: [[TMP96:%.*]] = load ptr, ptr [[TMP86]], align 4 +; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP93]], align 4 +; I64-NEXT: store double [[TMP88]], ptr [[TMP94]], align 4 +; I64-NEXT: store double [[TMP89]], ptr [[TMP95]], align 4 +; I64-NEXT: store double [[TMP90]], ptr [[TMP96]], align 4 ; I64-NEXT: store double [[TMP91]], ptr [[TMP87]], align 4 ; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 ; I64-NEXT: 
[[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) @@ -201,9 +201,25 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 14 ; I32-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 15 ; I32-NEXT: [[TMP44:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 +; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 +; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 +; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 ; I32-NEXT: [[TMP45:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 +; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 +; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2 +; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 ; I32-NEXT: [[TMP46:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 +; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1 +; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 +; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 ; I32-NEXT: [[TMP55:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> +; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 +; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 +; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 +; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 ; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] ; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] ; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] @@ -236,37 +252,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP61]], align 4 ; I32-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP62]], align 4 ; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP71]], align 4 -; I32-NEXT: [[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 ; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4 -; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 ; I32-NEXT: store double [[TMP32]], ptr [[TMP24]], align 4 -; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 ; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4 -; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 ; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4 -; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 ; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4 -; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 ; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4 -; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2 ; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4 -; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 ; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4 -; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 ; I32-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 -; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], 
i32 1 ; I32-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 -; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 ; I32-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 -; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 ; I32-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 -; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 ; I32-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 -; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 ; I32-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 -; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 ; I32-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 -; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 ; I32-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 ; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) @@ -295,21 +295,21 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 2 ; I32-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 3 ; I32-NEXT: [[TMP78:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> -; I32-NEXT: [[TMP79:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] -; I32-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] -; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] -; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] -; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP79]], align 4 -; I32-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 -; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 -; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 ; I32-NEXT: [[TMP87:%.*]] = extractelement <4 x double> [[TMP78]], i32 0 -; I32-NEXT: store double [[TMP87]], ptr [[TMP83]], align 4 ; I32-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP78]], i32 1 -; I32-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 ; I32-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP78]], i32 2 -; I32-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 ; I32-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP78]], i32 3 +; I32-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I32-NEXT: [[TMP84:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I32-NEXT: [[TMP85:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I32-NEXT: [[TMP92:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I32-NEXT: [[TMP93:%.*]] = load ptr, ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP94:%.*]] = load ptr, ptr [[TMP84]], align 4 +; I32-NEXT: [[TMP95:%.*]] = load ptr, ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP92]], align 4 +; I32-NEXT: store double [[TMP87]], ptr [[TMP93]], align 4 +; I32-NEXT: store double [[TMP88]], ptr [[TMP94]], align 4 +; I32-NEXT: store double [[TMP89]], ptr [[TMP95]], align 4 ; I32-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 ; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 ; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) @@ -693,20 +693,20 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias % ; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7 ; I32-NEXT: [[PREDPHI:%.*]] = select 
<8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]] ; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0 -; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4 ; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1 -; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4 ; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2 -; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4 ; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3 -; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4 ; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4 -; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4 ; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5 -; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4 ; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6 -; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4 ; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7 +; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4 +; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4 +; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4 +; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4 +; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4 ; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4 ; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] ; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] @@ -847,32 +847,32 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s ; I64-NEXT: [[TMP70:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i32 0 ; I64-NEXT: [[TMP71:%.*]] = insertelement <2 x double> [[TMP70]], double [[TMP69]], i32 1 ; I64-NEXT: [[TMP72:%.*]] = fsub <2 x double> zeroinitializer, [[TMP59]] -; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]] -; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]] -; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]] -; I64-NEXT: [[TMP76:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]] -; I64-NEXT: [[TMP77:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]] -; I64-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] -; I64-NEXT: [[TMP79:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] -; I64-NEXT: [[TMP80:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] -; I64-NEXT: [[TMP81:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] -; I64-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] -; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]] ; I64-NEXT: [[TMP84:%.*]] = extractelement <2 x double> [[TMP72]], i32 0 -; I64-NEXT: store double [[TMP84]], ptr [[TMP76]], align 8 ; I64-NEXT: [[TMP85:%.*]] = extractelement <2 x double> [[TMP72]], i32 1 -; I64-NEXT: store double [[TMP85]], ptr [[TMP77]], align 8 +; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]] ; I64-NEXT: [[TMP86:%.*]] = extractelement <2 x double> [[TMP73]], i32 0 -; I64-NEXT: store double [[TMP86]], ptr [[TMP78]], align 8 ; I64-NEXT: [[TMP87:%.*]] = extractelement <2 x double> [[TMP73]], i32 1 -; I64-NEXT: store double [[TMP87]], ptr [[TMP79]], align 8 
+; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]] ; I64-NEXT: [[TMP88:%.*]] = extractelement <2 x double> [[TMP74]], i32 0 -; I64-NEXT: store double [[TMP88]], ptr [[TMP80]], align 8 ; I64-NEXT: [[TMP89:%.*]] = extractelement <2 x double> [[TMP74]], i32 1 -; I64-NEXT: store double [[TMP89]], ptr [[TMP81]], align 8 +; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]] ; I64-NEXT: [[TMP90:%.*]] = extractelement <2 x double> [[TMP75]], i32 0 -; I64-NEXT: store double [[TMP90]], ptr [[TMP82]], align 8 ; I64-NEXT: [[TMP91:%.*]] = extractelement <2 x double> [[TMP75]], i32 1 +; I64-NEXT: [[TMP93:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]] +; I64-NEXT: [[TMP94:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]] +; I64-NEXT: [[TMP95:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; I64-NEXT: [[TMP96:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] +; I64-NEXT: [[TMP97:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; I64-NEXT: [[TMP98:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] +; I64-NEXT: [[TMP99:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] +; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]] +; I64-NEXT: store double [[TMP84]], ptr [[TMP93]], align 8 +; I64-NEXT: store double [[TMP85]], ptr [[TMP94]], align 8 +; I64-NEXT: store double [[TMP86]], ptr [[TMP95]], align 8 +; I64-NEXT: store double [[TMP87]], ptr [[TMP96]], align 8 +; I64-NEXT: store double [[TMP88]], ptr [[TMP97]], align 8 +; I64-NEXT: store double [[TMP89]], ptr [[TMP98]], align 8 +; I64-NEXT: store double [[TMP90]], ptr [[TMP99]], align 8 ; I64-NEXT: store double [[TMP91]], ptr [[TMP83]], align 8 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; I64-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 @@ -928,17 +928,17 @@ define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %s ; I32-NEXT: [[TMP34:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP30]], i32 2 ; I32-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP34]], double [[TMP31]], i32 3 ; I32-NEXT: [[TMP36:%.*]] = fsub <4 x double> zeroinitializer, [[TMP35]] +; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0 +; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1 +; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2 +; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3 ; I32-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]] ; I32-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP5]] ; I32-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP6]] ; I32-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP7]] -; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0 ; I32-NEXT: store double [[TMP41]], ptr [[TMP37]], align 8 -; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1 ; I32-NEXT: store double [[TMP42]], ptr [[TMP38]], align 8 -; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2 ; I32-NEXT: store double [[TMP43]], ptr [[TMP39]], align 8 -; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3 ; I32-NEXT: store double [[TMP44]], ptr [[TMP40]], align 8 ; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; I32-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll 
b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index bdbac7c..005696a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -53,11 +53,11 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP11]], <16 x i64> [[TMP12]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP13]], i32 16, <16 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> align 16 [[TMP13]], <16 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP14:%.*]] = or disjoint <16 x i64> [[VEC_IND3]], splat (i64 1) ; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP11]], <16 x i64> [[TMP15]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP16]], i32 8, <16 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> align 8 [[TMP16]], <16 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 32) ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <16 x i64> [[VEC_IND3]], splat (i64 32) @@ -96,11 +96,11 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND15]] ; CHECK-NEXT: [[TMP20:%.*]] = add nsw <8 x i64> [[TMP18]], [[VEC_IND20]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP19]], <8 x i64> [[TMP20]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP21]], i32 16, <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> align 16 [[TMP21]], <8 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP22:%.*]] = or disjoint <8 x i64> [[VEC_IND20]], splat (i64 1) ; CHECK-NEXT: [[TMP23:%.*]] = add nsw <8 x i64> [[TMP18]], [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP19]], <8 x i64> [[TMP23]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP24]], i32 8, <8 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> align 8 [[TMP24]], <8 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX14]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <8 x i64> [[VEC_IND15]], splat (i64 16) ; CHECK-NEXT: [[VEC_IND_NEXT21]] = add <8 x i64> [[VEC_IND20]], splat (i64 16) @@ -110,18 +110,18 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC7]] ; CHECK-NEXT: br i1 [[CMP_N23]], label %[[FOR_COND_CLEANUP_LOOPEXIT99]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[IND_END8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 8, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ [[IND_END11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
%[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i64 [ [[IND_END8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 8, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i64 [ [[IND_END11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[ITER_CHECK22]]: ; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9 ; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP26]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = add nuw i64 [[TMP27]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK25:%.*]] = icmp ult i64 [[TMP28]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK25]], label %[[VEC_EPILOG_SCALAR_PH40:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK24:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK25]], label %[[VEC_EPILOG_SCALAR_PH42:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK24:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK24]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK28:%.*]] = icmp ult i64 [[TMP28]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK28]], label %[[VEC_EPILOG_PH42:.*]], label %[[VECTOR_PH25:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK28]], label %[[VEC_EPILOG_PH45:.*]], label %[[VECTOR_PH25:.*]] ; CHECK: [[VECTOR_PH25]]: ; CHECK-NEXT: [[N_MOD_VF31:%.*]] = urem i64 [[TMP28]], 16 ; CHECK-NEXT: [[N_VEC32:%.*]] = sub i64 [[TMP28]], [[N_MOD_VF31]] @@ -131,27 +131,27 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[IND_END41:%.*]] = add i64 8, [[TMP29]] ; CHECK-NEXT: [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2 ; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true) -; CHECK-NEXT: br label %[[VECTOR_BODY29:.*]] -; CHECK: [[VECTOR_BODY29]]: -; CHECK-NEXT: [[INDEX34:%.*]] = phi i64 [ 0, %[[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], %[[VECTOR_BODY29]] ] -; CHECK-NEXT: [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], %[[VECTOR_BODY29]] ] -; CHECK-NEXT: [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], %[[VECTOR_BODY29]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY30:.*]] +; CHECK: [[VECTOR_BODY30]]: +; CHECK-NEXT: [[INDEX34:%.*]] = phi i64 [ 0, %[[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], %[[VECTOR_BODY30]] ] +; CHECK-NEXT: [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], %[[VECTOR_BODY30]] ] +; CHECK-NEXT: [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %[[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], %[[VECTOR_BODY30]] ] ; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i64> splat (i64 8), [[VEC_IND35]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <16 x i64> [[VEC_IND35]] ; CHECK-NEXT: [[TMP32:%.*]] = add nsw <16 x i64> [[TMP30]], [[VEC_IND37]] ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP31]], <16 x i64> [[TMP32]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP33]], i32 16, <16 x i1> [[TMP34]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> 
splat (i32 8), <16 x ptr> align 16 [[TMP33]], <16 x i1> [[TMP34]]) ; CHECK-NEXT: [[TMP49:%.*]] = or disjoint <16 x i64> [[VEC_IND37]], splat (i64 1) ; CHECK-NEXT: [[TMP36:%.*]] = add nsw <16 x i64> [[TMP30]], [[TMP49]] ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds [10 x i32], <16 x ptr> [[TMP31]], <16 x i64> [[TMP36]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> [[TMP37]], i32 8, <16 x i1> [[TMP34]]) -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 7), <16 x ptr> [[TMP33]], i32 16, <16 x i1> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 7), <16 x ptr> [[TMP37]], i32 8, <16 x i1> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 8), <16 x ptr> align 8 [[TMP37]], <16 x i1> [[TMP34]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 7), <16 x ptr> align 16 [[TMP33]], <16 x i1> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> splat (i32 7), <16 x ptr> align 8 [[TMP37]], <16 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[INDEX_NEXT39]] = add nuw i64 [[INDEX34]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32) ; CHECK-NEXT: [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32) ; CHECK-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]] -; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK37:.*]], label %[[VECTOR_BODY29]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK37:.*]], label %[[VECTOR_BODY30]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK37]]: ; CHECK-NEXT: [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]] ; CHECK-NEXT: br i1 [[CMP_N40]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK43:.*]] @@ -160,9 +160,9 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[IND_END55:%.*]] = add i64 8, [[TMP42]] ; CHECK-NEXT: [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_MOD_VF31]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK50]], label %[[VEC_EPILOG_SCALAR_PH40]], label %[[VEC_EPILOG_PH42]], !prof [[PROF3]] -; CHECK: [[VEC_EPILOG_PH42]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK50]], label %[[VEC_EPILOG_SCALAR_PH42]], label %[[VEC_EPILOG_PH45]], !prof [[PROF3]] +; CHECK: [[VEC_EPILOG_PH45]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL39:%.*]] = phi i64 [ [[N_VEC32]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] ; CHECK-NEXT: [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 8, %[[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] ; CHECK-NEXT: [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] ; CHECK-NEXT: [[N_MOD_VF52:%.*]] = urem i64 [[TMP28]], 8 @@ -179,37 +179,37 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0 ; CHECK-NEXT: [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14> -; CHECK-NEXT: br label 
%[[VEC_EPILOG_VECTOR_BODY49:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY49]]: -; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], %[[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], %[[VEC_EPILOG_VECTOR_BODY49]] ] -; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], %[[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], %[[VEC_EPILOG_VECTOR_BODY49]] ] -; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], %[[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], %[[VEC_EPILOG_VECTOR_BODY49]] ] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY56:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY56]]: +; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL39]], %[[VEC_EPILOG_PH45]] ], [ [[INDEX_NEXT74:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] +; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], %[[VEC_EPILOG_PH45]] ], [ [[VEC_IND_NEXT66:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] +; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], %[[VEC_EPILOG_PH45]] ], [ [[VEC_IND_NEXT71:%.*]], %[[VEC_EPILOG_VECTOR_BODY56]] ] ; CHECK-NEXT: [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]] ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]] ; CHECK-NEXT: [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]] ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP45]], <8 x i64> [[TMP46]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP47]], i32 16, <8 x i1> [[TMP48]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> align 16 [[TMP47]], <8 x i1> [[TMP48]]) ; CHECK-NEXT: [[TMP54:%.*]] = or disjoint <8 x i64> [[VEC_IND70]], splat (i64 1) ; CHECK-NEXT: [[TMP50:%.*]] = add nsw <8 x i64> [[TMP44]], [[TMP54]] ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [10 x i32], <8 x ptr> [[TMP45]], <8 x i64> [[TMP50]], i64 0 -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> [[TMP51]], i32 8, <8 x i1> [[TMP48]]) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 7), <8 x ptr> [[TMP47]], i32 16, <8 x i1> [[BROADCAST_SPLAT73]]) -; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 7), <8 x ptr> [[TMP51]], i32 8, <8 x i1> [[BROADCAST_SPLAT73]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 8), <8 x ptr> align 8 [[TMP51]], <8 x i1> [[TMP48]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 7), <8 x ptr> align 16 [[TMP47]], <8 x i1> [[BROADCAST_SPLAT73]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> splat (i32 7), <8 x ptr> align 8 [[TMP51]], <8 x i1> [[BROADCAST_SPLAT73]]) ; CHECK-NEXT: [[INDEX_NEXT74]] = add nuw i64 [[INDEX61]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16) ; CHECK-NEXT: [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16) ; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]] -; CHECK-NEXT: br i1 [[TMP55]], label %[[VEC_EPILOG_MIDDLE_BLOCK40:.*]], label %[[VEC_EPILOG_VECTOR_BODY49]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK40]]: +; CHECK-NEXT: br i1 [[TMP55]], label %[[VEC_EPILOG_MIDDLE_BLOCK63:.*]], label %[[VEC_EPILOG_VECTOR_BODY56]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK63]]: ; CHECK-NEXT: [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]] -; CHECK-NEXT: br i1 
[[CMP_N65]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH40]] -; CHECK: [[VEC_EPILOG_SCALAR_PH40]]: -; CHECK-NEXT: [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], %[[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 8, %[[ITER_CHECK22]] ] -; CHECK-NEXT: [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], %[[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[ITER_CHECK22]] ] +; CHECK-NEXT: br i1 [[CMP_N65]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH42]] +; CHECK: [[VEC_EPILOG_SCALAR_PH42]]: +; CHECK-NEXT: [[BC_RESUME_VAL65:%.*]] = phi i64 [ [[IND_END54]], %[[VEC_EPILOG_MIDDLE_BLOCK63]] ], [ [[IND_END55]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 8, %[[ITER_CHECK22]] ] +; CHECK-NEXT: [[BC_RESUME_VAL66:%.*]] = phi i64 [ [[IND_END57]], %[[VEC_EPILOG_MIDDLE_BLOCK63]] ], [ [[IND_END58]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[ITER_CHECK22]] ] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], %[[FOR_COND_CLEANUP4_US_LCSSA_US_US:.*]] ], [ [[BC_RESUME_VAL56]], %[[VEC_EPILOG_SCALAR_PH40]] ] -; CHECK-NEXT: [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], %[[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL67]], %[[VEC_EPILOG_SCALAR_PH40]] ] +; CHECK-NEXT: [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], %[[FOR_COND_CLEANUP4_US_LCSSA_US_US:.*]] ], [ [[BC_RESUME_VAL65]], %[[VEC_EPILOG_SCALAR_PH42]] ] +; CHECK-NEXT: [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], %[[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL66]], %[[VEC_EPILOG_SCALAR_PH42]] ] ; CHECK-NEXT: [[TMP56:%.*]] = sub nsw i64 8, [[INDVARS_IV78]] ; CHECK-NEXT: [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV78]] ; CHECK-NEXT: [[TMP57:%.*]] = add nsw i64 [[TMP56]], [[INDVARS_IV70]] @@ -241,8 +241,8 @@ define void @_Z3fn1v() #0 { ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV95:%.*]] = phi i64 [ [[INDVARS_IV_NEXT96:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[INDVARS_IV87:%.*]] = phi i64 [ [[INDVARS_IV_NEXT88:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV95:%.*]] = phi i64 [ [[INDVARS_IV_NEXT96:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV87:%.*]] = phi i64 [ [[INDVARS_IV_NEXT88:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL18]], %[[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[TMP60:%.*]] = sub nsw i64 8, [[INDVARS_IV95]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, i64 [[INDVARS_IV95]] ; CHECK-NEXT: [[TMP61:%.*]] = add nsw i64 [[TMP60]], [[INDVARS_IV87]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll index 272b62b..905e67b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll @@ -29,7 +29,7 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 
15>, [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr [[DST]], i32 1, <16 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr align 1 [[DST]], <16 x i1> [[TMP0]]) ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_1_LOOPEXIT1:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 15e2678..2ecd15e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -529,6 +529,14 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; CHECK-NEXT: [[TMP19:%.*]] = trunc <8 x i32> [[TMP18]] to <8 x i8> +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP8]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP10]] @@ -537,21 +545,13 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP13]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP14]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP15]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i8> [[TMP19]], i32 0 ; CHECK-NEXT: store i8 [[TMP28]], ptr [[TMP20]], align 1 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i8> [[TMP19]], i32 1 ; CHECK-NEXT: store i8 [[TMP29]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i8> [[TMP19]], i32 2 ; CHECK-NEXT: store i8 [[TMP30]], ptr [[TMP22]], align 1 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i8> [[TMP19]], i32 3 ; CHECK-NEXT: store i8 [[TMP31]], ptr [[TMP23]], align 1 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i8> [[TMP19]], i32 4 ; CHECK-NEXT: store i8 [[TMP32]], ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i8> [[TMP19]], i32 5 ; CHECK-NEXT: store i8 [[TMP33]], ptr [[TMP25]], align 1 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i8> [[TMP19]], i32 6 ; CHECK-NEXT: store i8 [[TMP34]], ptr [[TMP26]], align 1 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i8> [[TMP19]], i32 7 ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -608,53 +608,53 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: [[STRIDED_VEC1:%.*]] = 
shufflevector <32 x i32> [[WIDE_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31> ; MAX-BW-NEXT: [[TMP34:%.*]] = add <16 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] ; MAX-BW-NEXT: [[TMP35:%.*]] = trunc <16 x i32> [[TMP34]] to <16 x i8> -; MAX-BW-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP16]] -; MAX-BW-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] -; MAX-BW-NEXT: [[TMP38:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] -; MAX-BW-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] -; MAX-BW-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]] -; MAX-BW-NEXT: [[TMP41:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]] -; MAX-BW-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]] -; MAX-BW-NEXT: [[TMP43:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]] -; MAX-BW-NEXT: [[TMP44:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]] -; MAX-BW-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]] -; MAX-BW-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]] -; MAX-BW-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]] -; MAX-BW-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]] -; MAX-BW-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]] -; MAX-BW-NEXT: [[TMP50:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]] -; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]] ; MAX-BW-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[TMP35]], i32 0 -; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP36]], align 1 ; MAX-BW-NEXT: [[TMP53:%.*]] = extractelement <16 x i8> [[TMP35]], i32 1 -; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP37]], align 1 ; MAX-BW-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[TMP35]], i32 2 -; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP38]], align 1 ; MAX-BW-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP35]], i32 3 -; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP39]], align 1 ; MAX-BW-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[TMP35]], i32 4 -; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP40]], align 1 ; MAX-BW-NEXT: [[TMP57:%.*]] = extractelement <16 x i8> [[TMP35]], i32 5 -; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP41]], align 1 ; MAX-BW-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[TMP35]], i32 6 -; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP42]], align 1 ; MAX-BW-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP35]], i32 7 -; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP43]], align 1 ; MAX-BW-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[TMP35]], i32 8 -; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP44]], align 1 ; MAX-BW-NEXT: [[TMP61:%.*]] = extractelement <16 x i8> [[TMP35]], i32 9 -; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP45]], align 1 ; MAX-BW-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[TMP35]], i32 10 -; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP46]], align 1 ; MAX-BW-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP35]], i32 11 -; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP47]], align 1 ; MAX-BW-NEXT: 
[[TMP64:%.*]] = extractelement <16 x i8> [[TMP35]], i32 12 -; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP48]], align 1 ; MAX-BW-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[TMP35]], i32 13 -; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP49]], align 1 ; MAX-BW-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[TMP35]], i32 14 -; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP50]], align 1 ; MAX-BW-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP35]], i32 15 +; MAX-BW-NEXT: [[TMP69:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP16]] +; MAX-BW-NEXT: [[TMP70:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP17]] +; MAX-BW-NEXT: [[TMP71:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP18]] +; MAX-BW-NEXT: [[TMP72:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP19]] +; MAX-BW-NEXT: [[TMP73:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP20]] +; MAX-BW-NEXT: [[TMP74:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP21]] +; MAX-BW-NEXT: [[TMP75:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP22]] +; MAX-BW-NEXT: [[TMP76:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP23]] +; MAX-BW-NEXT: [[TMP77:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP24]] +; MAX-BW-NEXT: [[TMP78:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP25]] +; MAX-BW-NEXT: [[TMP79:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP26]] +; MAX-BW-NEXT: [[TMP80:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP27]] +; MAX-BW-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP28]] +; MAX-BW-NEXT: [[TMP82:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP29]] +; MAX-BW-NEXT: [[TMP83:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP30]] +; MAX-BW-NEXT: [[TMP51:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[TMP31]] +; MAX-BW-NEXT: store i8 [[TMP52]], ptr [[TMP69]], align 1 +; MAX-BW-NEXT: store i8 [[TMP53]], ptr [[TMP70]], align 1 +; MAX-BW-NEXT: store i8 [[TMP54]], ptr [[TMP71]], align 1 +; MAX-BW-NEXT: store i8 [[TMP55]], ptr [[TMP72]], align 1 +; MAX-BW-NEXT: store i8 [[TMP56]], ptr [[TMP73]], align 1 +; MAX-BW-NEXT: store i8 [[TMP57]], ptr [[TMP74]], align 1 +; MAX-BW-NEXT: store i8 [[TMP58]], ptr [[TMP75]], align 1 +; MAX-BW-NEXT: store i8 [[TMP59]], ptr [[TMP76]], align 1 +; MAX-BW-NEXT: store i8 [[TMP60]], ptr [[TMP77]], align 1 +; MAX-BW-NEXT: store i8 [[TMP61]], ptr [[TMP78]], align 1 +; MAX-BW-NEXT: store i8 [[TMP62]], ptr [[TMP79]], align 1 +; MAX-BW-NEXT: store i8 [[TMP63]], ptr [[TMP80]], align 1 +; MAX-BW-NEXT: store i8 [[TMP64]], ptr [[TMP81]], align 1 +; MAX-BW-NEXT: store i8 [[TMP65]], ptr [[TMP82]], align 1 +; MAX-BW-NEXT: store i8 [[TMP66]], ptr [[TMP83]], align 1 ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index 7069534..4c3fd29 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -18,12 +18,12 @@ define dso_local void @tail_folding_enabled(ptr noalias nocapture %A, ptr noalia ; CHECK-NEXT: [[VEC_IV:%.*]] = add 
<8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 429) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP2]], <8 x i1> [[TMP1]], <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP4]], <8 x i1> [[TMP1]], <8 x i32> poison) ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr align 4 [[TMP7]], <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -67,12 +67,12 @@ define dso_local void @tail_folding_disabled(ptr noalias nocapture %A, ptr noali ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 429) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP2]], <8 x i1> [[TMP1]], <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP4]], <8 x i1> [[TMP1]], <8 x i32> poison) ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <8 x i1> [[TMP1]]) +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr align 4 [[TMP7]], <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] @@ -134,9 +134,9 @@ define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP5]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP5]], <8 x i1> [[TMP4]], <8 x i32> poison) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP7]], <8 x i1> [[TMP4]], <8 x i32> poison) ; CHECK-NEXT: [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP10]] = add <8 x i32> [[TMP9]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll index 03a2a74..eca70b3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-gaps.ll @@ -21,13 +21,13 @@ define void @load_store_interleave_group_with_gaps(ptr noalias %data, i64 nounde ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15> -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC]], <4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC]], <4 x ptr> align 8 [[TMP1]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint <4 x i64> [[TMP0]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], <4 x i64> [[TMP3]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC1]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC1]], <4 x ptr> align 8 [[TMP4]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <4 x i64> [[TMP0]], splat (i64 3) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[DATA]], <4 x i64> [[TMP5]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC2]], <4 x ptr> [[TMP6]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[STRIDED_VEC2]], <4 x ptr> align 8 [[TMP6]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 56f0b85..fda944e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -132,11 +132,11 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 19) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP2]], <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr align 4 [[TMP4]], <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr [[TMP4]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]] +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0(<8 x float> [[TMP6]], ptr align 4 [[TMP4]], <8 x i1> [[TMP1]]), !llvm.access.group [[ACC_GRP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll index 1e94f83..c8e3766 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll @@ -28,12 +28,12 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> ; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP2]], <16 x i1> [[TMP1]], <16 x i32> poison) ; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[INDEX]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP4]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr align 4 [[TMP4]], <16 x i1> [[TMP1]], <16 x i32> poison) ; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] ; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] 
-; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <16 x i1> [[TMP1]]) +; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr align 4 [[TMP7]], <16 x i1> [[TMP1]]) ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -91,7 +91,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; NO-VP: vec.epilog.iter.check: ; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; NO-VP: vec.epilog.ph: ; NO-VP-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; NO-VP-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8 @@ -108,7 +108,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: store <8 x i32> [[TMP38]], ptr [[TMP39]], align 4 ; NO-VP-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 8 ; NO-VP-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC10]] -; NO-VP-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-NEXT: br i1 [[TMP41]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; NO-VP: vec.epilog.middle.block: ; NO-VP-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]] ; NO-VP-NEXT: br i1 [[CMP_N11]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_SCALAR_PH]] @@ -126,7 +126,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 ; NO-VP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; NO-VP: for.cond.cleanup: ; NO-VP-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll index 455fe83..4068498 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll @@ -69,7 +69,7 @@ define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> ; CHECK-NEXT: [[TMP15:%.*]] = and <48 x i1> [[INTERLEAVED_MASK]], <i1 
true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false> -; CHECK-NEXT: call void @llvm.masked.store.v48i8.p0(<48 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison>, ptr [[TMP13]], i32 1, <48 x i1> [[TMP15]]) +; CHECK-NEXT: call void @llvm.masked.store.v48i8.p0(<48 x i8> <i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison, i8 0, i8 poison, i8 0, i8 poison, i8 poison, i8 poison>, ptr align 1 [[TMP13]], <48 x i1> [[TMP15]]), !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10008 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll index 4fb928d..38617d2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-native-inner-loop-only.ll @@ -32,12 +32,12 @@ define void @test(ptr %A) { ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[STRIDED_VEC]], splat (i32 2) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP13]], i32 0 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP13]], i32 1 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP9]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP10]], align 4 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96 diff --git a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll index 9ea9e11..efc9a4f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll 
+++ b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll @@ -26,6 +26,7 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4) ; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4) ; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) @@ -33,16 +34,15 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD4]], splat (i64 128) ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD5]], splat (i64 128) ; CHECK-NEXT: [[TMP19:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD6]], splat (i64 128) -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP27]] ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i64, ptr [[TMP28]], i32 4 ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i64, ptr [[TMP28]], i32 8 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i64, ptr [[TMP28]], i32 12 -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP12]], ptr [[TMP28]], i32 4, <4 x i1> [[TMP16]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP13]], ptr [[TMP33]], i32 4, <4 x i1> [[TMP17]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP14]], ptr [[TMP34]], i32 4, <4 x i1> [[TMP18]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP15]], ptr [[TMP35]], i32 4, <4 x i1> [[TMP19]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP12]], ptr align 4 [[TMP28]], <4 x i1> [[TMP16]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP13]], ptr align 4 [[TMP33]], <4 x i1> [[TMP17]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP14]], ptr align 4 [[TMP34]], <4 x i1> [[TMP18]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP15]], ptr align 4 [[TMP35]], <4 x i1> [[TMP19]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4) ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 @@ -105,10 +105,10 @@ define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i64, ptr [[TMP24]], i32 4 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i64, ptr [[TMP24]], i32 8 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i64, ptr [[TMP24]], i32 12 -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD]], ptr [[TMP24]], i32 4, <4 x i1> [[TMP16]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD1]], ptr [[TMP29]], i32 4, <4 x i1> [[TMP17]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD2]], ptr [[TMP30]], i32 4, <4 x i1> [[TMP18]]) -; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD3]], ptr [[TMP31]], i32 4, <4 x i1> [[TMP19]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD]], ptr align 4 [[TMP24]], <4 x i1> [[TMP16]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> 
[[WIDE_LOAD1]], ptr align 4 [[TMP29]], <4 x i1> [[TMP17]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD2]], ptr align 4 [[TMP30]], <4 x i1> [[TMP18]]) +; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[WIDE_LOAD3]], ptr align 4 [[TMP31]], <4 x i1> [[TMP19]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index f29428c..efcc000 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -122,7 +122,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; DISABLED_MASKED_STRIDED: pred.load.continue14: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr [[TMP50]], i32 1, <8 x i1> [[TMP0]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr align 1 [[TMP50]], <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 @@ -143,10 +143,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP3]], i32 1, <8 x i1> [[TMP0]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP3]], <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1016 @@ -295,7 +295,7 @@ define dso_local void @masked_strided1_optsize(ptr noalias nocapture readonly %p ; 
DISABLED_MASKED_STRIDED: pred.load.continue14: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr [[TMP50]], i32 1, <8 x i1> [[TMP0]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr align 1 [[TMP50]], <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 @@ -317,10 +317,10 @@ define dso_local void @masked_strided1_optsize(ptr noalias nocapture readonly %p ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[TMP3]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[TMP3]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], i32 1, <8 x i1> [[TMP0]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP4]], <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 @@ -484,7 +484,7 @@ define dso_local void @masked_strided1_optsize_unknown_tc(ptr noalias nocapture ; DISABLED_MASKED_STRIDED: pred.load.continue16: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr [[TMP52]], i32 1, <8 x i1> [[TMP3]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr align 1 [[TMP52]], <8 x i1> [[TMP3]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -516,10 +516,10 @@ define dso_local void @masked_strided1_optsize_unknown_tc(ptr noalias 
nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[TMP5]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP3]], <16 x i1> [[TMP5]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -688,7 +688,7 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; DISABLED_MASKED_STRIDED: pred.load.continue16: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr [[TMP52]], i32 1, <8 x i1> [[TMP3]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP51]], ptr align 1 [[TMP52]], <8 x i1> [[TMP3]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -720,10 +720,10 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr [[TMP3]], i32 1, <24 x i1> [[TMP5]], <24 x i8> poison) +; 
ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP6]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -794,20 +794,20 @@ define dso_local void @unconditional_strided1_optsize(ptr noalias nocapture read ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i64 3 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i64 4 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i64 5 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i64 6 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i64 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP7]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP11]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP13]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds nuw i8, ptr [[P]], i32 [[TMP15]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP2]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP4]], align 1 @@ -841,7 +841,7 @@ define dso_local void @unconditional_strided1_optsize(ptr noalias nocapture read ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP0]] -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP1]], <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], ptr [[TMP2]], align 1 @@ -988,7 +988,7 @@ define dso_local void @unconditional_strided1_optsize_unknown_tc(ptr noalias noc ; DISABLED_MASKED_STRIDED: pred.load.continue14: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr [[TMP50]], i32 1, <8 x i1> [[TMP0]]) +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP49]], ptr align 1 [[TMP50]], <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -1017,10 +1017,10 @@ define dso_local void @unconditional_strided1_optsize_unknown_tc(ptr noalias noc ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[TMP3]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[TMP3]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = 
shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], i32 1, <8 x i1> [[TMP0]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP4]], <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -1416,14 +1416,14 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC1]]) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP6]], <16 x i1> [[INTERLEAVED_MASK]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 @@ -2550,14 +2550,14 @@ define dso_local void @masked_strided2_unknown_tc(ptr noalias nocapture readonly ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 
5, i32 6, i32 6, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP3]], <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]]) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP8]], <16 x i1> [[INTERLEAVED_MASK]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -2982,14 +2982,14 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(ptr noalias noca ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[TMP2]], <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]]) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[Q:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]] ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 
2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr align 1 [[TMP6]], <16 x i1> [[INTERLEAVED_MASK]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll index 414394a..500e603 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -27,41 +27,41 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP0]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], splat (i64 2) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP1]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP1]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP1]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP1]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS:%.*]], i64 [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP4]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP6]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP8]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP10]], ptr [[TMP3]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], ptr [[TMP5]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 2 ; 
DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP12]], ptr [[TMP7]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP13]], ptr [[TMP9]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP14]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = or disjoint <4 x i64> [[TMP1]], splat (i64 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP15]], i64 0 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP15]], i64 1 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP15]], i64 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP15]], i64 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP16]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP18]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP20]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i16, ptr [[POINTS]], i64 [[TMP22]] -; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP24]], ptr [[TMP17]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 1 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP25]], ptr [[TMP19]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 2 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], ptr [[TMP21]], align 2 -; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i64 3 ; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], ptr [[TMP23]], align 2 ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -82,7 +82,7 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, ptr [[TMP2]], align 2 ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison> -; ENABLED_MASKED_STRIDED-NEXT: call void 
@llvm.masked.store.v16i16.p0(<16 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], i32 2, <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0(<16 x i16> [[INTERLEAVED_VEC]], ptr align 2 [[GEP]], <16 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -140,7 +140,7 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr ; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP1]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[TMP1]], <4 x i1> [[TMP0]], <4 x i16> poison) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nsw <4 x i64> [[VEC_IND]], splat (i64 2) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -179,7 +179,7 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] ; DISABLED_MASKED_STRIDED: pred.store.continue6: ; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]] -; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP19]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[TMP19]], <4 x i1> [[TMP0]], <4 x i16> poison) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = or disjoint <4 x i64> [[TMP2]], splat (i64 1) ; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] @@ -245,15 +245,15 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP1]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[TMP1]], <4 x i1> [[TMP0]], <4 
x i16> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDEX]], 3 ; ENABLED_MASKED_STRIDED-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[POINTS:%.*]], i64 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP3]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr align 2 [[TMP3]], <4 x i1> [[TMP0]], <4 x i16> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_MASKED_LOAD]], <4 x i16> [[WIDE_MASKED_LOAD3]], <16 x i32> <i32 0, i32 4, i32 poison, i32 poison, i32 1, i32 5, i32 poison, i32 poison, i32 2, i32 6, i32 poison, i32 poison, i32 3, i32 7, i32 poison, i32 poison> ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0(<16 x i16> [[INTERLEAVED_VEC]], ptr [[GEP]], i32 2, <16 x i1> [[TMP5]]) +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0(<16 x i16> [[INTERLEAVED_VEC]], ptr align 2 [[GEP]], <16 x i1> [[TMP5]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP6]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index f293ed1..deef94a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -21,7 +21,7 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP3]], i32 4, <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 [[TMP3]], <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> poison) ; CHECK-NEXT: br i1 [[C]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; CHECK: pred.sdiv.if: ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_MASKED_LOAD]], i32 0 @@ -61,7 +61,7 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; SINK-GATHER-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] ; SINK-GATHER-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4 ; SINK-GATHER-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]] -; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> 
[[BROADCAST_SPLAT]], <8 x i32> poison) +; SINK-GATHER-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 [[TMP3]], <8 x i1> [[BROADCAST_SPLAT]], <8 x i32> poison) ; SINK-GATHER-NEXT: br i1 [[C]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]] ; SINK-GATHER: pred.sdiv.if: ; SINK-GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[WIDE_MASKED_LOAD]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll index 368361f..0f55d79 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-interleaved-access.ll @@ -15,7 +15,7 @@ for.cond.cleanup: for.body: %i.09 = phi i16 [ 0, %entry ], [ %add3, %for.body ] - %res.08 = phi x86_fp80 [ undef, %entry ], [ %3, %for.body ] + %res.08 = phi x86_fp80 [ zeroinitializer, %entry ], [ %3, %for.body ] %arrayidx = getelementptr inbounds x86_fp80, ptr %a, i16 %i.09 %0 = load x86_fp80, ptr %arrayidx, align 1 %add = or i16 %i.09, 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll index c9fb05c..d29719d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll @@ -33,11 +33,11 @@ define void @example() { ; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; FORCED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; FORCED-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[VEC_IND]] to <2 x x86_fp80> +; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 0 +; FORCED-NEXT: [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 1 ; FORCED-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[TMP0]] ; FORCED-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x x86_fp80], ptr @x, i64 0, i64 [[TMP1]] -; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 0 ; FORCED-NEXT: store x86_fp80 [[TMP5]], ptr [[TMP3]], align 16 -; FORCED-NEXT: [[TMP6:%.*]] = extractelement <2 x x86_fp80> [[TMP2]], i32 1 ; FORCED-NEXT: store x86_fp80 [[TMP6]], ptr [[TMP4]], align 16 ; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FORCED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) @@ -100,8 +100,8 @@ define void @test_replicating_store_x86_fp80_cost(i32 %n, ptr %dst) #0 { ; FORCED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; FORCED-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[VEC_IND]] to <2 x i64> ; FORCED-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; FORCED-NEXT: [[TMP6:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP5]] ; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; FORCED-NEXT: [[TMP6:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP5]] ; FORCED-NEXT: [[TMP8:%.*]] = getelementptr x86_fp80, ptr [[DST]], i64 [[TMP7]] ; FORCED-NEXT: store x86_fp80 0xK00000000000000000000, ptr [[TMP6]], align 16 ; FORCED-NEXT: store x86_fp80 0xK00000000000000000000, ptr [[TMP8]], align 16 diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll index 224ec4a6..b41ddeb 100644 --- a/llvm/test/Transforms/LoopVectorize/assume.ll +++ b/llvm/test/Transforms/LoopVectorize/assume.ll @@ -15,15 +15,15 @@ define void @test1(ptr noalias nocapture %a, ptr noalias 
nocapture readonly %b) ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+02) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+02) -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP3]]) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]]) -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) ; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll index b7aa958..784ccd2 100644 --- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll +++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll @@ -18,20 +18,20 @@ define void @foo(ptr nocapture %A) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 ; CHECK-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: store i32 4, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: store i32 4, ptr [[TMP5]], align 4 ; CHECK-NEXT: store i32 4, ptr [[TMP7]], align 4 ; CHECK-NEXT: store i32 4, ptr [[TMP9]], align 4 +; CHECK-NEXT: store i32 4, ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index 1fc4a01..4540551 100644 --- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -988,30 +988,30 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP47]], i32 2 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP56]], i32 3 ; CHECK-NEXT: [[TMP25:%.*]] = sub <4 x i32> [[TMP24]], [[TMP12]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3 ; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP40]], [[TMP40]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 2 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 2 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 2 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 2 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP25]], i32 0 ; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP26]], align 8 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i32 1 ; CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP27]], align 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i32 2 ; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3 ; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP29]], align 8 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 3 ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 3 ; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 3 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 3 -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i32> [[TMP39]], i32 0 ; CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP48]], align 8 -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP39]], i32 1 ; CHECK-NEXT: store i32 [[TMP53]], ptr [[TMP49]], align 8 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP39]], i32 2 ; CHECK-NEXT: store i32 [[TMP54]], ptr [[TMP50]], align 8 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP39]], i32 3 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[TMP51]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1078,30 +1078,30 @@ define void @pointer_iv_non_uniform_0(ptr %a, i64 %n) { ; INTER-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i32> [[WIDE_VEC5]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> ; INTER-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i32> 
[[WIDE_VEC5]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13> ; INTER-NEXT: [[TMP17:%.*]] = sub <4 x i32> [[STRIDED_VEC6]], [[STRIDED_VEC]] +; INTER-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0 +; INTER-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP17]], i32 1 +; INTER-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2 +; INTER-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3 ; INTER-NEXT: [[TMP18:%.*]] = sub <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC4]] +; INTER-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0 +; INTER-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 +; INTER-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 +; INTER-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 ; INTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 2 ; INTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 2 ; INTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 2 ; INTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 2 -; INTER-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP17]], i32 0 ; INTER-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 8 -; INTER-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP17]], i32 1 ; INTER-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 8 -; INTER-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP17]], i32 2 ; INTER-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 8 -; INTER-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP17]], i32 3 ; INTER-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 8 ; INTER-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP]], i32 3 ; INTER-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP1]], i32 3 ; INTER-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP2]], i32 3 ; INTER-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[NEXT_GEP3]], i32 3 -; INTER-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0 ; INTER-NEXT: store i32 [[TMP23]], ptr [[TMP19]], align 8 -; INTER-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1 ; INTER-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 8 -; INTER-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2 ; INTER-NEXT: store i32 [[TMP25]], ptr [[TMP27]], align 8 -; INTER-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3 ; INTER-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 8 ; INTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTER-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1316,9 +1316,9 @@ define i32 @pointer_iv_mixed(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 8, !alias.scope [[META20:![0-9]+]] ; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: store <4 x 
ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8, !alias.scope [[META23:![0-9]+]], !noalias [[META20]] @@ -1382,9 +1382,9 @@ define i32 @pointer_iv_mixed(ptr %a, ptr %b, i64 %n) { ; INTER-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] ; INTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; INTER-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12> +; INTER-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; INTER-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; INTER-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX]] -; INTER-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; INTER-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 8, !alias.scope [[META20:![0-9]+]] ; INTER-NEXT: [[TMP7]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]] ; INTER-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8, !alias.scope [[META23:![0-9]+]], !noalias [[META20]] diff --git a/llvm/test/Transforms/LoopVectorize/cse-casts.ll b/llvm/test/Transforms/LoopVectorize/cse-casts.ll index e923560..fb45745 100644 --- a/llvm/test/Transforms/LoopVectorize/cse-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/cse-casts.ll @@ -319,8 +319,9 @@ define void @preserve_flags_narrowing_extends_and_truncs(ptr noalias %A, ptr noa ; CHECK: [[PRED_STORE_CONTINUE60]]: ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br [[EXIT:label %.*]] -; CHECK: [[SCALAR_PH:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -349,3 +350,45 @@ loop: exit: ret void } + +define void @simplified_cast_preserves_irflag_type(ptr noalias %p, ptr noalias %q, ptr noalias %r) { +; CHECK-LABEL: define void @simplified_cast_preserves_irflag_type( +; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], ptr noalias [[R:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[Q]], align 2 +; CHECK-NEXT: store i16 [[TMP2]], ptr [[R]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %x = load i8, ptr %p + %x.i32 = zext i8 %x to i32 + %trunc = trunc i32 %x.i32 to i16 + store i16 %trunc, ptr %q + %x.i16 = zext i8 %x to i16 + store i16 %x.i16, ptr %r + %iv.next = add i64 %iv, 2 + %ec = icmp eq i64 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void 
+} diff --git a/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll b/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll new file mode 100644 index 0000000..c0692f3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-widen-divrem-via-safe-divisor=false -S %s | FileCheck %s + +define void @multiple_vppredinstphi_with_same_predicate(ptr %A, i32 %d) { +; CHECK-LABEL: define void @multiple_vppredinstphi_with_same_predicate( +; CHECK-SAME: ptr [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] +; CHECK: [[PRED_SDIV_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i32 -10, [[D]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] +; CHECK: [[PRED_SDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP4]], %[[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2]] +; CHECK: [[PRED_SDIV_IF1]]: +; CHECK-NEXT: [[TMP7:%.*]] = sdiv i32 -10, [[D]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP7]], i32 1 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]] +; CHECK: [[PRED_SDIV_CONTINUE2]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ [[TMP5]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP8]], %[[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP9]], [[TMP9]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP10]], <2 x i32> zeroinitializer +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP0]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i32 %iv + %l = load i32, ptr %gep.A + %c = icmp sgt i32 %l, 0 + br i1 %c, label %then, label %loop.latch + +then: + %div.0 = sdiv i32 -10, %d + %div.1 = sdiv i32 -10, %d + %add = add i32 %div.1, %div.0 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %add, %then ], [ 0, %loop.header ] + store i32 %merge, ptr %gep.A + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @multiple_vppredinstphi_with_different_predicate(ptr %A, i32 %d) { +; CHECK-LABEL: 
define void @multiple_vppredinstphi_with_different_predicate( +; CHECK-SAME: ptr [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_SDIV_CONTINUE6:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] +; CHECK: [[PRED_SDIV_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i32 -10, [[D]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] +; CHECK: [[PRED_SDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP4]], %[[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]] +; CHECK: [[PRED_SDIV_IF1]]: +; CHECK-NEXT: [[TMP7:%.*]] = sdiv i32 -10, [[D]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP7]], i32 1 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]] +; CHECK: [[PRED_SDIV_CONTINUE2]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ [[TMP5]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP8]], %[[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP1]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i1> [[TMP1]], [[TMP10]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP9]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], splat (i32 20) +; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x i1> [[TMP12]], <2 x i1> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]] +; CHECK: [[PRED_SDIV_IF3]]: +; CHECK-NEXT: [[TMP15:%.*]] = sdiv i32 -10, [[D]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]] +; CHECK: [[PRED_SDIV_CONTINUE4]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i32> [ poison, %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP16]], %[[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1 +; CHECK-NEXT: br i1 [[TMP18]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_IF5]]: +; CHECK-NEXT: [[TMP19:%.*]] = sdiv i32 -10, [[D]] +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP19]], i32 1 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_CONTINUE6]]: +; CHECK-NEXT: [[TMP21:%.*]] = phi <2 x i32> [ [[TMP17]], %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP20]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP21]], <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = add <2 x i32> [[PREDPHI]], [[PREDPHI7]] +; CHECK-NEXT: store <2 x i32> [[TMP22]], ptr [[TMP0]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 +; CHECK-NEXT: br i1 [[TMP23]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i32 %iv + %l = load i32, ptr %gep.A + %c.0 = icmp sgt i32 %l, 0 + br i1 %c.0, label %then.0, label %continue + +then.0: + %div.0 = sdiv i32 -10, %d + br label %continue + +continue: + %merge.0 = phi i32 [ %div.0, %then.0 ], [ 0, %loop.header ] + %c.1 = icmp sgt i32 %l, 20 + br i1 %c.1, label %then.1, label %loop.latch + +then.1: + %div.1 = sdiv i32 -10, %d + br label %loop.latch + +loop.latch: + %merge.1 = phi i32 [ %div.1, %then.1 ], [ 0, %continue ] + %add = add i32 %merge.0, %merge.1 + store i32 %add, ptr %gep.A + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll index e56b18f..9e2e74d 100644 --- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll @@ -17,13 +17,13 @@ define void @foo(ptr %h) !dbg !4 { ; CHECK: [[FOR_COND5_PREHEADER1]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP5:%.*]], %[[FOR_COND5_PREHEADER1]] ], !dbg [[DBG23:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[H]], <4 x i64> [[VEC_PHI]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> zeroinitializer, <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> zeroinitializer, <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)), !dbg [[DBG24:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 1, !dbg [[DBG26:![0-9]+]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 1), <4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 1), <4 x ptr> align 4 [[TMP2]], <4 x i1> splat (i1 true)), !dbg [[DBG24]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 2, !dbg [[DBG26]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 2), <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 2), <4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true)), !dbg [[DBG24]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 3, !dbg [[DBG26]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 3), <4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG24]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 3), <4 x ptr> align 4 [[TMP4]], <4 x i1> splat (i1 true)), !dbg [[DBG24]] ; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1), !dbg [[DBG27:![0-9]+]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], splat (i64 5), !dbg [[DBG28:![0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0, !dbg [[DBG29:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll 
b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index bcea03a..5177d7b 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -15,8 +15,8 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 @@ -144,8 +144,8 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 2) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 2) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 @@ -219,8 +219,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 @@ -294,8 +294,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ 
"dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 @@ -369,8 +369,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP2]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[TMP3]], i64 4) ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 @@ -514,6 +514,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer @@ -536,10 +538,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK: [[PRED_LOAD_CONTINUE2]]: ; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP10]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP12]], <2 x i32> [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP20]], i64 4), "dereferenceable"(ptr [[TMP20]], i64 4) ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP19]], i64 4), "dereferenceable"(ptr [[TMP19]], i64 4) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP3]], i64 4), "dereferenceable"(ptr [[TMP3]], i64 4) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP30]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 2 @@ -593,8 +593,8 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] ; CHECK-NEXT: 
[[TMP1:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP4]], i64 4), "dereferenceable"(ptr [[TMP4]], i64 4) ] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP5]], i64 4), "dereferenceable"(ptr [[TMP5]], i64 4) ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll index c07dc88..7659613 100644 --- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll +++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll @@ -48,14 +48,14 @@ define dso_local void @forked_ptrs_different_base_same_offset(ptr nocapture read ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x ptr> [[BROADCAST_SPLAT]], <4 x ptr> [[BROADCAST_SPLAT9]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr float, ptr [[TMP11]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP29]], i64 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr float, ptr [[TMP13]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x ptr> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr float, ptr [[TMP15]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP31]], i64 12 ; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP10]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/histograms.ll b/llvm/test/Transforms/LoopVectorize/histograms.ll index f0ceae7..5bb8722 100644 --- a/llvm/test/Transforms/LoopVectorize/histograms.ll +++ b/llvm/test/Transforms/LoopVectorize/histograms.ll @@ -16,8 +16,8 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP3]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[TMP5]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/i8-induction.ll 
b/llvm/test/Transforms/LoopVectorize/i8-induction.ll index 220fd64..712c75d 100644 --- a/llvm/test/Transforms/LoopVectorize/i8-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/i8-induction.ll @@ -20,7 +20,7 @@ scalar.ph: for.body: %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ] ; <------- i8 induction var. - %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ] + %c.015 = phi i8 [ 0, %scalar.ph ], [ %conv8, %for.body ] %conv2 = sext i8 %c.015 to i32 %tobool = icmp ne i8 %c.015, 0 %.sink = select i1 %tobool, i8 %c.015, i8 %0 diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index f7376a0..c164c4a 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -277,7 +277,7 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) { ; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_2_LCSSA:%.*]] = phi i32 [ [[INEWCHUNKS_2]], [[FOR_INC23]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC26]] ; UNROLL-NOSIMPLIFY: for.inc26: -; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INEWCHUNKS_1_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY9]] ], [ [[INEWCHUNKS_2_LCSSA]], [[FOR_INC26_LOOPEXIT]] ] ; UNROLL-NOSIMPLIFY-NEXT: unreachable ; ; VEC-LABEL: @bug18724( @@ -376,7 +376,7 @@ for.inc23: br i1 %cmp13, label %for.body14, label %for.inc26 for.inc26: - %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ] + %iNewChunks.1.lcssa = phi i32 [ 0, %for.body9 ], [ %iNewChunks.2, %for.inc23 ] unreachable } diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll index 9e75002..5cf99b8 100644 --- a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll +++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll @@ -53,8 +53,8 @@ thread-pre-split.loopexit: ; preds = %11, %.thread-pre-sp br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21 .lr.ph21: ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader - %d.020 = phi ptr [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ] - %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ] + %d.020 = phi ptr [ zeroinitializer, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ] + %10 = phi i64 [ %28, %26 ], [ zeroinitializer, %thread-pre-split.loopexit ], [ zeroinitializer, %thread-pre-split.preheader ] br i1 %arg, label %11, label %22 ; <label>:11 ; preds = %.lr.ph21 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index cc55a51..e339953 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1247,8 +1247,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP2]], 
align 8 ; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1293,8 +1293,8 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; IND-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 ; IND-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[TMP6]], [[BROADCAST_SPLAT]] ; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0 -; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1 +; IND-NEXT: store i32 [[TMP8]], ptr [[TMP1]], align 8 ; IND-NEXT: store i32 [[TMP9]], ptr [[TMP2]], align 8 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1347,14 +1347,14 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 ; UNROLL-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 ; UNROLL-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP10]], [[BROADCAST_SPLAT]] -; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP15]], i64 0 -; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 ; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP15]], i64 1 -; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP14]], [[BROADCAST_SPLAT]] ; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i64 0 -; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP16]], i64 1 +; UNROLL-NEXT: store i32 [[TMP17]], ptr [[TMP3]], align 8 +; UNROLL-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 ; UNROLL-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1408,14 +1408,14 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = xor <2 x i32> [[TMP11]], [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = xor <2 x i32> [[TMP15]], [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 -; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; UNROLL-NO-IC-NEXT: store i32 [[TMP18]], ptr [[TMP4]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP19]], ptr [[TMP5]], align 8 +; UNROLL-NO-IC-NEXT: store i32 [[TMP20]], ptr [[TMP6]], align 8 ; UNROLL-NO-IC-NEXT: store i32 [[TMP21]], ptr [[TMP7]], align 8 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; 
UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1473,22 +1473,22 @@ define void @scalarize_induction_variable_03(ptr %p, i32 %y, i64 %n) { ; INTERLEAVE-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> ; INTERLEAVE-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] -; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP17]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP17]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 ; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP17]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 ; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP17]], i64 3 -; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = xor <4 x i32> [[STRIDED_VEC2]], [[BROADCAST_SPLAT]] ; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0 -; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 ; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1 -; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 ; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2 -; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3 +; INTERLEAVE-NEXT: store i32 [[TMP19]], ptr [[TMP9]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP20]], ptr [[TMP10]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP21]], ptr [[TMP11]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP22]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP23]], ptr [[TMP13]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP24]], ptr [[TMP14]], align 8 +; INTERLEAVE-NEXT: store i32 [[TMP25]], ptr [[TMP15]], align 8 ; INTERLEAVE-NEXT: store i32 [[TMP26]], ptr [[TMP16]], align 8 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1568,10 +1568,10 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP13]], align 1, !alias.scope [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP26]], align 1, !alias.scope [[META17:![0-9]+]] ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 @@ -1630,16 +1630,16 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; 
IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP10:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) ; IND-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP10]], i64 0 -; IND-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; IND-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i64 1 +; IND-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP12]], align 1, !alias.scope [[META17:![0-9]+]] +; IND-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 1, !alias.scope [[META17:![0-9]+]] ; IND-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 1, !alias.scope [[META17]] ; IND-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 ; IND-NEXT: [[TMP17:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i64 12 -; IND-NEXT: store i32 [[TMP24]], ptr [[TMP16]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] +; IND-NEXT: store i32 [[TMP25]], ptr [[TMP16]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] ; IND-NEXT: store i32 [[TMP15]], ptr [[TMP18]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) @@ -1694,20 +1694,20 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[TMP12:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0 +; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1 ; UNROLL-NEXT: [[STEP_ADD:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 2) ; UNROLL-NEXT: [[TMP13:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 8) -; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i64 0 -; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i64 1 +; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0 +; UNROLL-NEXT: [[TMP35:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1 ; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] -; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP13]], i64 0 ; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] -; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP13]], i64 1 ; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] -; UNROLL-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP15]], align 1, !alias.scope [[META17:![0-9]+]] -; UNROLL-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope [[META17]] -; UNROLL-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]] -; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 
[[TMP35]] +; UNROLL-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP17]], align 1, !alias.scope [[META17:![0-9]+]] +; UNROLL-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP19]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP21]], align 1, !alias.scope [[META17]] +; UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP36]], align 1, !alias.scope [[META17]] ; UNROLL-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 ; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] @@ -1716,9 +1716,9 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i64 20 ; UNROLL-NEXT: [[TMP28:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[TMP28]], i64 28 -; UNROLL-NEXT: store i32 [[TMP35]], ptr [[TMP23]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] -; UNROLL-NEXT: store i32 [[TMP36]], ptr [[TMP25]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; UNROLL-NEXT: store i32 [[TMP37]], ptr [[TMP27]], align 1, !alias.scope [[META20]], !noalias [[META17]] +; UNROLL-NEXT: store i32 [[TMP37]], ptr [[TMP23]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] +; UNROLL-NEXT: store i32 [[TMP38]], ptr [[TMP25]], align 1, !alias.scope [[META20]], !noalias [[META17]] +; UNROLL-NEXT: store i32 [[TMP39]], ptr [[TMP27]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; UNROLL-NEXT: store i32 [[TMP22]], ptr [[TMP29]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) @@ -1779,19 +1779,19 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 2 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 3 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = shl nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shl nsw <2 x i64> [[STEP_ADD]], splat (i64 2) -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP15]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]] -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP19]] -; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] -; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP16]], align 1, !alias.scope [[META17:![0-9]+]] -; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope [[META17]] -; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]] -; UNROLL-NO-IC-NEXT: 
[[TMP26:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP37]] +; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP18]], align 1, !alias.scope [[META17:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP20]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP22]], align 1, !alias.scope [[META17]] +; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP38]], align 1, !alias.scope [[META17]] ; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]], i32 1 @@ -1859,7 +1859,15 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: [[DOTIDX5:%.*]] = shl nsw i64 [[TMP14]], 4 ; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[DOTIDX5]] ; INTERLEAVE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP18]], align 1, !alias.scope [[META17:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 +; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 +; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 ; INTERLEAVE-NEXT: [[WIDE_VEC3:%.*]] = load <16 x i32>, ptr [[TMP19]], align 1, !alias.scope [[META17]] +; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 +; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4 +; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 +; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 ; INTERLEAVE-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 ; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] @@ -1876,21 +1884,13 @@ define void @scalarize_induction_variable_04(ptr %a, ptr %p, i32 %n) { ; INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP24]], i64 52 ; INTERLEAVE-NEXT: [[TMP26:%.*]] = getelementptr [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i64 60 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP28]], ptr [[TMP41]], align 1, !alias.scope [[META20:![0-9]+]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 4 ; INTERLEAVE-NEXT: store i32 [[TMP29]], ptr [[TMP16]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 8 ; INTERLEAVE-NEXT: store i32 [[TMP30]], ptr [[TMP42]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[WIDE_VEC]], i64 12 ; INTERLEAVE-NEXT: store i32 [[TMP31]], ptr [[TMP20]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 0 ; INTERLEAVE-NEXT: store i32 [[TMP32]], ptr [[TMP21]], align 1, !alias.scope 
[[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 4 ; INTERLEAVE-NEXT: store i32 [[TMP33]], ptr [[TMP23]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 8 ; INTERLEAVE-NEXT: store i32 [[TMP34]], ptr [[TMP25]], align 1, !alias.scope [[META20]], !noalias [[META17]] -; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <16 x i32> [[WIDE_VEC3]], i64 12 ; INTERLEAVE-NEXT: store i32 [[TMP35]], ptr [[TMP27]], align 1, !alias.scope [[META20]], !noalias [[META17]] ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2445,11 +2445,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 ; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; CHECK-NEXT: store i16 [[TMP10]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2492,13 +2492,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; IND-NEXT: [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16> +; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 +; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; IND-NEXT: [[TMP16:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]] ; IND-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP16]], i64 6 -; IND-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i64 0 ; IND-NEXT: store i16 [[TMP8]], ptr [[TMP6]], align 2 -; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1 ; IND-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) @@ -2544,7 +2544,11 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NEXT: [[TMP8:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16> +; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 +; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 ; UNROLL-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> +; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 +; 
UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; UNROLL-NEXT: [[TMP24:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]] @@ -2553,13 +2557,9 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP25]], i64 10 ; UNROLL-NEXT: [[TMP26:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP26]], i64 14 -; UNROLL-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP8]], i64 0 ; UNROLL-NEXT: store i16 [[TMP14]], ptr [[TMP10]], align 2 -; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP8]], i64 1 ; UNROLL-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 -; UNROLL-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i64 0 ; UNROLL-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 -; UNROLL-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP9]], i64 1 ; UNROLL-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 4) @@ -2610,18 +2610,18 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16> +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16> +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[TMP3]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP4]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP5]], i32 1 ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I16]], ptr [[P]], i64 [[TMP6]], i32 1 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 ; UNROLL-NO-IC-NEXT: store i16 [[TMP15]], ptr [[TMP11]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 ; UNROLL-NO-IC-NEXT: store i16 [[TMP16]], ptr [[TMP12]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 ; UNROLL-NO-IC-NEXT: store i16 [[TMP17]], ptr [[TMP13]], align 2 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 ; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], ptr [[TMP14]], align 2 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], splat (i32 2) @@ -2666,7 +2666,15 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[STEP_ADD]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16> +; INTERLEAVE-NEXT: 
[[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 +; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 ; INTERLEAVE-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i16> +; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 +; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 +; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 2 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]] @@ -2683,21 +2691,13 @@ define void @iv_vector_and_scalar_users(ptr %p, i32 %a, i32 %n) { ; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP40]], i64 26 ; INTERLEAVE-NEXT: [[TMP41:%.*]] = getelementptr [[PAIR_I16]], ptr [[P]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP41]], i64 30 -; INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[TMP12]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP22]], ptr [[TMP14]], align 2 -; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP12]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP23]], ptr [[TMP15]], align 2 -; INTERLEAVE-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP12]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP24]], ptr [[TMP16]], align 2 -; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[TMP12]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP25]], ptr [[TMP17]], align 2 -; INTERLEAVE-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[TMP13]], i64 0 ; INTERLEAVE-NEXT: store i16 [[TMP26]], ptr [[TMP18]], align 2 -; INTERLEAVE-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[TMP13]], i64 1 ; INTERLEAVE-NEXT: store i16 [[TMP27]], ptr [[TMP19]], align 2 -; INTERLEAVE-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[TMP13]], i64 2 ; INTERLEAVE-NEXT: store i16 [[TMP28]], ptr [[TMP20]], align 2 -; INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: store i16 [[TMP29]], ptr [[TMP21]], align 2 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 0ebb652..16a56f3 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -149,8 +149,8 @@ define void @interleaved_with_cond_store_1(ptr %p, i64 %x, i64 %n) { ; CHECK: pred.store.continue2: ; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 0 -; CHECK-NEXT: store i64 [[TMP11]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[WIDE_VEC3]], i64 2 +; CHECK-NEXT: store i64 [[TMP11]], ptr [[TMP2]], align 8 ; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git 
a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index b4cad11..8efe29a 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -557,6 +557,10 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture % ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1021, [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 1020, [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P1]], i64 [[TMP1]] @@ -578,21 +582,17 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture % ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i64 3 ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i64> [[TMP19]], [[VEC_IND]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3 ; CHECK-NEXT: store i64 [[TMP21]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP3]], i64 1 ; CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP3]], i64 2 ; CHECK-NEXT: store i64 [[TMP23]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP3]], i64 3 ; CHECK-NEXT: store i64 [[TMP24]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i64 0 ; CHECK-NEXT: store i64 [[TMP25]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP20]], i64 1 ; CHECK-NEXT: store i64 [[TMP26]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i64 2 ; CHECK-NEXT: store i64 [[TMP27]], ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP20]], i64 3 ; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP11]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) @@ -791,8 +791,8 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 undef, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> @@ -822,8 +822,8 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] - %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %SumB.014 = phi float [ 0.0e+00, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ 0, %entry ], [ %add, %for.body ] %a = getelementptr inbounds %struct.IntFloat, ptr %A, i64 %indvars.iv, i32 0 %tmp = load i32, ptr %a, align 4 %add = add nsw i32 %tmp, %SumA.013 @@ -888,12 +888,12 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) { ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -971,12 +971,12 @@ define i32 @PR27626_1(ptr %p, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP20]], i64 28 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 +; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> @@ -1073,12 +1073,12 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: store i32 [[Z]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = 
extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4 +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1150,29 +1150,29 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i64 0 ; CHECK-NEXT: [[DOTSPLIT3:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT3]], i64 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i64 1 ; CHECK-NEXT: [[DOTSPLIT4:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT4]], i64 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i64 2 ; CHECK-NEXT: [[DOTSPLIT5:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT5]], i64 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP2]], i64 3 ; CHECK-NEXT: [[DOTSPLIT6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT6]], i64 4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 0 -; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 2 -; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 4 -; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[WIDE_VEC]], i64 6 +; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP8]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP10]], align 4 ; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i32>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x 
i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> @@ -1347,7 +1347,15 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 3, i64 5, i64 7, i64 9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3 ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -3) +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP12]], i64 12 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]] @@ -1356,21 +1364,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP34]], i64 28 ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP35]], i64 36 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP7]], i64 2 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP7]], i64 3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP8]], i64 0 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP8]], i64 3 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP28]] ; CHECK-NEXT: store i32 [[X:%.*]], ptr [[TMP15]], align 4 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP17]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 9358fd9..00256a5 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -480,8 +480,8 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: 
[[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[TMP6]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 2 @@ -551,8 +551,8 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll index ffeb3b1..f8ddd34 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll @@ -192,8 +192,8 @@ define void @no_gep_or_bitcast(ptr noalias %a, i64 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: store i32 0, ptr [[TMP1]], align 8 ; CHECK-NEXT: store i32 0, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index 3c59a27..fe25d1b 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -447,8 +447,8 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) { ; INTERLEAVE-NEXT: [[STEP_ADD3:%.*]] = add <2 x i32> [[VEC_IND1]], splat (i32 2) ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[INDEX]], !custom_md [[META2:![0-9]+]] ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[VEC_IND]] -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[STEP_ADD]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 0 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B]], <2 x i64> [[STEP_ADD]] ; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 ; INTERLEAVE-NEXT: store <2 x i32> [[VEC_IND1]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: store <2 x i32> [[STEP_ADD3]], ptr [[TMP5]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll index 741ca55..1675a59 100644 --- a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll +++ 
b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll @@ -48,7 +48,7 @@ for.cond.cleanup.loopexit: br label %for.cond.cleanup, !dbg !33 for.cond.cleanup: - %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ undef, %entry ], !dbg !33 + %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ 0, %entry ], !dbg !33 %sub = add nsw i32 %0, -5, !dbg !33 %idxprom3 = sext i32 %sub to i64, !dbg !33 %arrayidx4 = getelementptr inbounds i32, ptr %vla, i64 %idxprom3, !dbg !33 diff --git a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll index 659dc62..2038633 100644 --- a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll @@ -22,8 +22,8 @@ entry: br label %for.body for.body: - %inc107 = phi i32 [ undef, %entry ], [ %inc10, %for.body ] - %inc6 = phi i32 [ %nf.promoted, %entry ], [ undef, %for.body ] + %inc107 = phi i32 [ 0, %entry ], [ %inc10, %for.body ] + %inc6 = phi i32 [ %nf.promoted, %entry ], [ 3, %for.body ] %add55 = phi i32 [ %n.promoted, %entry ], [ %add5, %for.body ] %.neg2 = sub i32 0, %inc6 %add.neg = add i32 0, %add55 diff --git a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll index 53dad3a..7b0c366 100644 --- a/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll +++ b/llvm/test/Transforms/LoopVectorize/narrow-to-single-scalar.ll @@ -92,12 +92,12 @@ define void @narrow_to_single_scalar_store_address_not_uniform_across_all_parts( ; VF4IC1-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 ; VF4IC1-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[VEC_IND]], splat (i32 1) ; VF4IC1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP5]] ; VF4IC1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 -; VF4IC1-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP7]] ; VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP9]] ; VF4IC1-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP5]] +; VF4IC1-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP7]] +; VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP9]] ; VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i32 [[TMP11]] ; VF4IC1-NEXT: store i32 [[TMP0]], ptr [[TMP6]], align 4 ; VF4IC1-NEXT: store i32 [[TMP1]], ptr [[TMP8]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 1319d06..bfc7fee 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -802,17 +802,17 @@ define void @multiple_ivs_wide(ptr %dst) { ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 6 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 2) +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds 
i32, ptr [[DST]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP5]], i32 0 ; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP5]], i32 1 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3 ; CHECK-NEXT: store i32 [[TMP13]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 8) @@ -838,17 +838,17 @@ define void @multiple_ivs_wide(ptr %dst) { ; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], 6 ; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[VEC_IND2]], splat (i32 2) +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP17]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 ; CHECK-NEXT: store i32 [[TMP24]], ptr [[TMP20]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 ; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP21]], align 4 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 ; CHECK-NEXT: store i32 [[TMP26]], ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 ; CHECK-NEXT: store i32 [[TMP27]], ptr [[TMP23]], align 4 ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], splat (i32 8) diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 9f82795..763072a 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -262,8 +262,8 @@ define void @pr43371() optsize { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -288,8 +288,8 @@ define void @pr43371() optsize { ; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x 
i16> [[TMP0]] to <2 x i32> ; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -314,8 +314,8 @@ define void @pr43371() optsize { ; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -363,8 +363,8 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 @@ -389,8 +389,8 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> ; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] ; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll index 568a0db..8a77d14 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll @@ -22,14 +22,14 @@ define void @inner_latch_header_first_successor(i64 %N, i32 %c, i64 %M) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i64], ptr @A, i64 0, <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat 
(i1 true), <4 x i64> poison) -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: br label %[[INNER3:.*]] ; CHECK: [[INNER3]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[INNER3]] ] ; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[INNER3]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i64], ptr @B, i64 0, <4 x i64> [[VEC_PHI]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i64> poison) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[WIDE_MASKED_GATHER5]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP3]] = add nsw <4 x i64> [[TMP2]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) @@ -37,8 +37,8 @@ define void @inner_latch_header_first_successor(i64 %N, i32 %c, i64 %M) { ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label %[[VECTOR_LATCH]], label %[[INNER3]] ; CHECK: [[VECTOR_LATCH]]: -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER3]] ] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[VEC_PHI6]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER3]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP10]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -124,14 +124,14 @@ define void @inner_latch_header_second_successor(i64 %N, i32 %c, i64 %M) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i64], ptr @A, i64 0, <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true), <4 x i64> poison) -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: br label %[[INNER3:.*]] ; CHECK: [[INNER3]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[INNER3]] ] ; CHECK-NEXT: [[VEC_PHI4:%.*]] = 
phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[INNER3]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i64], ptr @B, i64 0, <4 x i64> [[VEC_PHI]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 4 [[TMP1]], <4 x i1> splat (i1 true), <4 x i64> poison) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i64> [[WIDE_MASKED_GATHER5]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP3]] = add nsw <4 x i64> [[TMP2]], [[VEC_PHI4]] ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) @@ -139,8 +139,8 @@ define void @inner_latch_header_second_successor(i64 %N, i32 %c, i64 %M) { ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[INNER3]] ; CHECK: [[VECTOR_LATCH]]: -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER3]] ] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[VEC_PHI6]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER3]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP9]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll index 32b1fc4..2e17f7a 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll @@ -28,9 +28,9 @@ define void @test(ptr %src, i64 %n) { ; CHECK: loop.32: ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, [[LOOP_2_HEADER1]] ], [ [[TMP2:%.*]], [[LOOP_32]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2000 x i32], ptr [[SRC:%.*]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI3]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], splat (i32 10) -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI3]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll index c0dc1cc..59e3d71 100644 --- a/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/outer-loop-wide-phis.ll @@ -14,7 +14,7 @@ define void @wide_phi_2_predecessors(ptr noalias %A, ptr 
noalias %B, i32 %c, i1 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: br label %[[INNER_HEADER1:.*]] ; CHECK: [[INNER_HEADER1]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[INNER_LATCH3:.*]] ] @@ -22,7 +22,7 @@ define void @wide_phi_2_predecessors(ptr noalias %A, ptr noalias %B, i32 %c, i1 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[B]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN2:.*]], label %[[INNER_LATCH3]] ; CHECK: [[THEN2]]: -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 [[TMP1]], <4 x i1> splat (i1 true), <4 x i64> poison) ; CHECK-NEXT: br label %[[INNER_LATCH3]] ; CHECK: [[INNER_LATCH3]]: ; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[THEN2]] ], [ zeroinitializer, %[[INNER_HEADER1]] ] @@ -33,8 +33,8 @@ define void @wide_phi_2_predecessors(ptr noalias %A, ptr noalias %B, i32 %c, i1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[INNER_HEADER1]] ; CHECK: [[VECTOR_LATCH]]: -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH3]] ] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[VEC_PHI6]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH3]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP10]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 @@ -122,7 +122,7 @@ define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: br label %[[INNER_HEADER1:.*]] ; CHECK: [[INNER_HEADER1]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[INNER_LATCH3:.*]] ] @@ -130,7 +130,7 @@ define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias ; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[B]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN2:.*]], label %[[INNER_LATCH3]] ; CHECK: [[THEN2]]: -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> align 8 [[TMP1]], <4 x i1> splat (i1 true), <4 x i64> poison) ; CHECK-NEXT: br label %[[INNER_LATCH3]] ; CHECK: [[INNER_LATCH3]]: ; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i64> [ [[WIDE_MASKED_GATHER]], %[[THEN2]] ], [ zeroinitializer, %[[INNER_HEADER1]] ] @@ -141,8 +141,8 @@ define void @wide_phi_2_predecessors_phi_ops_swapped(ptr noalias %A, ptr noalias ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[INNER_HEADER1]] ; CHECK: [[VECTOR_LATCH]]: -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH3]] ] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[VEC_PHI6]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP3]], %[[INNER_LATCH3]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> [[TMP10]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll index 7e90724..b2f1954 100644 --- a/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_hcfg_construction.ll @@ -41,12 +41,12 @@ define void @non_outermost_loop_hcfg_construction(i64 %n, ptr %a) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[MIDDLE_LOOP_LATCH4:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[MIDDLE_LOOP_LATCH4]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[A]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> splat (i1 true), <4 x ptr> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> align 8 [[TMP3]], <4 x i1> splat (i1 true), <4 x ptr> poison) ; CHECK-NEXT: br label [[INNERMOST_LOOP3:%.*]] ; CHECK: innermost.loop3: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[INNERMOST_LOOP3]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> align 4 [[TMP4]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT3]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 @@ -187,9 +187,9 @@ define void 
@non_outermost_loop_hcfg_construction_other_loops_at_same_level(i64 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr ptr, ptr [[INVARIANT_GEP]], <4 x i64> [[VEC_PHI]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP3]], i32 8, <4 x i1> splat (i1 true), <4 x ptr> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> align 8 [[TMP3]], <4 x i1> splat (i1 true), <4 x ptr> poison) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> align 4 [[TMP4]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT3]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll index 70ce7a7..4774375 100644 --- a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll @@ -34,13 +34,13 @@ define void @foo() { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[OUTER_LOOP_LATCH4:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[OUTER_LOOP_LATCH4]] ] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) ; CHECK-NEXT: br label [[INNER_LOOP1:%.*]] ; CHECK: inner_loop1: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP13:%.*]], [[INNER_LOOP1]] ] ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x float> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP12:%.*]], [[INNER_LOOP1]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, <vscale x 4 x i64> [[VEC_PHI]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> poison) ; CHECK-NEXT: [[TMP12]] = fmul <vscale x 4 x float> [[VEC_PHI2]], [[WIDE_MASKED_GATHER3]] ; CHECK-NEXT: [[TMP13]] = add nuw nsw <vscale x 4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 4 x i64> [[TMP13]], splat (i64 512) @@ -48,7 +48,7 @@ define void @foo() { ; CHECK-NEXT: br i1 [[TMP15]], label [[OUTER_LOOP_LATCH4]], 
label [[INNER_LOOP1]] ; CHECK: vector.latch: ; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x float> [ [[TMP12]], [[INNER_LOOP1]] ] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VEC_PHI5]], <vscale x 4 x ptr> [[TMP10]], i32 4, <vscale x 4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VEC_PHI5]], <vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll index c68a6d1..4086c79 100644 --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll @@ -35,13 +35,13 @@ define void @foo(i32 %n) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32> -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> align 4 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: br label %[[FOR_BODY31:.*]] ; CHECK: [[FOR_BODY31]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP4:%.*]], %[[FOR_BODY31]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 8) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll index 29e6333..fb9b1c7 100644 --- a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll @@ -23,11 +23,11 @@ ; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] ; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] ; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @A, i64 0, <4 x i64> %[[VecInd]] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[CSplat]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true)) +; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[CSplat]], <4 x ptr> align 4 %[[AAddr]], <4 x i1> splat (i1 true)) ; CHECK: br i1 %[[ZeroTripChk]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]] ; CHECK: [[InnerForPh]]: -; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true), 
<4 x i32> poison) +; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %[[AAddr]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> ; CHECK: br label %[[InnerForBody:.*]] @@ -35,7 +35,7 @@ ; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ zeroinitializer, %[[InnerForPh]] ], [ %[[InnerIndNext:.*]], %[[InnerForBody]] ] ; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[WideAVal]], %[[InnerForPh]] ], [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ] ; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], ptr @B, i64 0, <4 x i64> %[[InnerInd]] -; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %[[BAddr]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 %[[BAddr]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]] ; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]] ; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], splat (i64 1) @@ -45,7 +45,7 @@ ; CHECK: [[InnerCrit]]: ; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ] -; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StorePhi]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> splat (i1 true)) +; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StorePhi]], <4 x ptr> align 4 %[[AAddr]], <4 x i1> splat (i1 true)) ; CHECK: br label %[[ForInc]] ; CHECK: [[ForInc]]: diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll index bf23485..cebf90a 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction-index-width-smaller-than-iv-width.ll @@ -17,11 +17,14 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 8, i32 16, i32 24> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP2]] @@ -29,11 +32,8 @@ define void @wide_ptr_induction_index_width_smaller_than_iv_width(ptr noalias %s ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[DST_0]], i64 [[TMP4]] ; CHECK-NEXT: store <4 x i64> [[WIDE_LOAD]], ptr [[TMP7]], align 8 ; CHECK-NEXT: store ptr 
[[TMP5]], ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 1 ; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2 ; CHECK-NEXT: store ptr [[TMP13]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3 ; CHECK-NEXT: store ptr [[TMP14]], ptr [[TMP10]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 32 diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index 5c04e4c..5c62ca3 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -147,11 +147,11 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[VECTOR_GEP]], i64 1 ; CHECK-NEXT: store <4 x ptr> [[TMP2]], ptr [[NEXT_GEP]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[WIDE_LOAD]], splat (i8 1) ; CHECK-NEXT: store <4 x i8> [[TMP6]], ptr [[TMP4]], align 1 @@ -551,12 +551,12 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) { ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[INDEX]] ; STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP21]], align 8 ; STRIDED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 0 -; STRIDED-NEXT: store i64 [[TMP23]], ptr [[NEXT_GEP]], align 8 ; STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 1 -; STRIDED-NEXT: store i64 [[TMP24]], ptr [[NEXT_GEP1]], align 8 ; STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 2 -; STRIDED-NEXT: store i64 [[TMP16]], ptr [[NEXT_GEP2]], align 8 ; STRIDED-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[WIDE_LOAD]], i32 3 +; STRIDED-NEXT: store i64 [[TMP23]], ptr [[NEXT_GEP]], align 8 +; STRIDED-NEXT: store i64 [[TMP24]], ptr [[NEXT_GEP1]], align 8 +; STRIDED-NEXT: store i64 [[TMP16]], ptr [[NEXT_GEP2]], align 8 ; STRIDED-NEXT: store i64 [[TMP25]], ptr [[NEXT_GEP3]], align 8 ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; STRIDED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/pr34681.ll b/llvm/test/Transforms/LoopVectorize/pr34681.ll index e1c1e20..0f509a5 100644 --- a/llvm/test/Transforms/LoopVectorize/pr34681.ll +++ b/llvm/test/Transforms/LoopVectorize/pr34681.ll @@ -62,12 +62,12 @@ define i32 @foo1(i32 %N, ptr nocapture readnone %A, ptr nocapture readonly %B, i ; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], 
[[BROADCAST_SPLAT3]] ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP13]], align 2 ; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[TMP15]], align 2 @@ -167,12 +167,12 @@ define i32 @foo2(i16 zeroext %N, ptr nocapture readnone %A, ptr nocapture readon ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[BROADCAST_SPLAT3]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[TMP10]], align 2 ; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[TMP12]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll index 32762a4..1bb6454 100644 --- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -54,12 +54,12 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i32, ptr [[B]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP8]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll index 6542c42..cf973af 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll @@ -131,8 +131,8 @@ define void @widen_ptr_induction_dbg(ptr %start, ptr %end) { ; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEBUGLOC-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG35:![0-9]+]] ; DEBUGLOC-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 8, i64 16, i64 24>, !dbg [[DBG35]] -; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0, !dbg [[DBG36:![0-9]+]] -; DEBUGLOC-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 1, !dbg [[DBG36]] +; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0 +; DEBUGLOC-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 1, !dbg [[DBG36:![0-9]+]] ; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; DEBUGLOC-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 32, !dbg [[DBG35]] ; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG37:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index ec7fde8..964a257 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -1207,11 +1207,11 @@ for.end: define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: define i32 @reduction_sum_multiuse( ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { -; CHECK-NEXT: [[_LR_PH1:.*]]: +; CHECK-NEXT: [[_LR_PH:.*]]: ; CHECK-NEXT: br label %[[DOTLR_PH:.*]] -; CHECK: [[_LR_PH:.*:]] -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] -; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] +; CHECK: [[_LR_PH1:.*:]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] +; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] ; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4 ; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] @@ -1231,11 +1231,11 @@ define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocaptu ; ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_multiuse( ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { -; CHECK-INTERLEAVED-NEXT: [[_LR_PH1:.*]]: +; CHECK-INTERLEAVED-NEXT: [[_LR_PH:.*]]: ; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]] -; CHECK-INTERLEAVED: [[_LR_PH:.*:]] -; 
CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] -; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ] +; CHECK-INTERLEAVED: [[_LR_PH1:.*:]] +; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH]] ] ; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4 ; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] @@ -1947,7 +1947,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer @@ -1966,7 +1966,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK: [[FOR_BODY2]]: ; CHECK-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2002,7 +2002,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-INTERLEAVED: [[VECTOR_BODY]]: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4 @@ -2033,7 +2033,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK-INTERLEAVED: [[SCALAR_PH]]: ; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-INTERLEAVED-NEXT: br 
label %[[FOR_BODY2:.*]] ; CHECK-INTERLEAVED: [[FOR_BODY2]]: ; CHECK-INTERLEAVED-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2063,7 +2063,7 @@ entry: for.body2: ; preds = %entry, %for.inc5 %a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ] - %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ] + %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ] %arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117 %0 = load i8, ptr %arrayidx, align 1 %tobool3.not = icmp eq i8 %0, 0 @@ -2100,7 +2100,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], zeroinitializer @@ -2122,7 +2122,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK: [[FOR_BODY2]]: ; CHECK-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2159,7 +2159,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-INTERLEAVED: [[VECTOR_BODY]]: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H]], i32 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4 @@ -2196,7 +2196,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] ; CHECK-INTERLEAVED: [[SCALAR_PH]]: ; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ undef, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY2:.*]] ; CHECK-INTERLEAVED: [[FOR_BODY2]]: ; CHECK-INTERLEAVED-NEXT: [[A_117:%.*]] = phi i32 [ [[INC6:%.*]], %[[FOR_INC5:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -2227,7 
+2227,7 @@ entry: for.body2: ; preds = %entry, %for.inc5 %a.117 = phi i32 [ %inc6, %for.inc5 ], [ 0, %entry ] - %g.016 = phi i32 [ %g.1, %for.inc5 ], [ undef, %entry ] + %g.016 = phi i32 [ %g.1, %for.inc5 ], [ 0, %entry ] %arrayidx = getelementptr inbounds i8, ptr %h, i32 %a.117 %0 = load i8, ptr %arrayidx, align 1 %tobool3.not = icmp eq i8 %0, 0 @@ -2263,7 +2263,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE6:.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 @@ -2340,7 +2340,7 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-INTERLEAVED: [[VECTOR_BODY]]: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE15:.*]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ undef, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP94:%.*]], %[[PRED_LOAD_CONTINUE15]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP98:%.*]], %[[PRED_LOAD_CONTINUE15]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 @@ -2480,7 +2480,7 @@ for.cond.cleanup: ; preds = %for.inc for.body: ; preds = %entry, %for.inc %g.09 = phi i32 [ 0, %entry ], [ %inc3, %for.inc ] - %a.08 = phi i32 [ undef, %entry ], [ %a.1, %for.inc ] + %a.08 = phi i32 [ 0, %entry ], [ %a.1, %for.inc ] %d = getelementptr inbounds [0 x %struct.e], ptr %b, i32 0, i32 %g.09, i32 1 %0 = load i32, ptr %d, align 4 %tobool.not = icmp eq i32 %0, 0 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll index 7dd2995..c708715 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll @@ -348,16 +348,16 @@ define void @reduc_store_inside_unrolled(ptr %dst, ptr readonly %src) { ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] ; CHECK-NEXT: 
[[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4, !alias.scope [[META16]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META16]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META16]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP37]], align 4, !alias.scope [[META16]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META16]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP38]], align 4, !alias.scope [[META16]] ; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4, !alias.scope [[META16]] ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1 @@ -554,16 +554,16 @@ define void @reduc_store_middle_store_predicated(ptr %dst, ptr readonly %src) { ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4, !alias.scope [[META23]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4, !alias.scope [[META23]] -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META23]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP37]], align 4, !alias.scope [[META23]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP23]], align 4, !alias.scope [[META23]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP38]], align 4, !alias.scope [[META23]] ; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4, !alias.scope [[META23]] ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 5894c3a..c270a23 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -41,8 +41,8 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x ptr> [[WIDE_LOAD]], <2 x ptr> poison, <2 x i32> <i32 1, i32 0> ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[REVERSE]], i32 0 -; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 
x ptr> [[REVERSE]], i32 1 +; CHECK-NEXT: store ptr null, ptr [[TMP8]], align 8 ; CHECK-NEXT: store ptr null, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll index 43e916b..6576675 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll @@ -17,8 +17,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ;CHECK: <i32 0, i32 -1, i32 -2, i32 -3> ;CHECK: ret define i32 @foo(i32 %n, ptr nocapture %A) { - %1 = icmp sgt i32 %n, 0 - br i1 %1, label %.lr.ph, label %._crit_edge + br label %.lr.ph .lr.ph: ; preds = %0 %2 = sext i32 %n to i64 @@ -26,7 +25,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) { ; <label>:3 ; preds = %.lr.ph, %3 %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ] - %sum.01 = phi i32 [ undef, %.lr.ph ], [ %9, %3 ] + %sum.01 = phi i32 [ 0, %.lr.ph ], [ %9, %3 ] %4 = trunc i64 %indvars.iv to i32 %5 = shl nsw i32 %4, 1 %6 = sext i32 %5 to i64 @@ -38,8 +37,7 @@ define i32 @foo(i32 %n, ptr nocapture %A) { %11 = icmp sgt i32 %10, 0 br i1 %11, label %3, label %._crit_edge -._crit_edge: ; preds = %3, %0 - %sum.0.lcssa = phi i32 [ undef, %0 ], [ %9, %3 ] - ret i32 %sum.0.lcssa +._crit_edge: ; preds = %3 + ret i32 %9 } diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll index 667df3a..c858f20 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll @@ -22,10 +22,10 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 [[TMP8]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x float>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x float> [[WIDE_LOAD]], i32 0 -; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x float>, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x float> [[WIDE_LOAD1]], i32 0 +; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[TMP10]], 1.000000e+02 ; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[TMP12]], 1.000000e+02 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]]) diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index 60da336..1216bc1 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -178,13 +178,14 @@ for.exit: define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1-LABEL: define i32 @recurrence_2( ; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) { -; CHECK-VF4UF1-NEXT: [[ENTRY:.*]]: -; CHECK-VF4UF1-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-VF4UF1-NEXT: br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK-VF4UF1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4UF1-NEXT: br label 
%[[FOR_PREHEADER:.*]] ; CHECK-VF4UF1: [[FOR_PREHEADER]]: ; CHECK-VF4UF1-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1 ; CHECK-VF4UF1-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4 -; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 +; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 ; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] @@ -202,7 +203,7 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1: [[VECTOR_BODY]]: ; CHECK-VF4UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-VF4UF1-NEXT: [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP10]], align 4 ; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1) @@ -225,25 +226,25 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF1: [[SCALAR_PH]]: ; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ] ; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] -; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ] +; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] ; CHECK-VF4UF1-NEXT: br label %[[SCALAR_BODY:.*]] ; CHECK-VF4UF1: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-VF4UF1-NEXT: [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4UF1-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-VF4UF1-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4UF1-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; CHECK-VF4UF1: [[FOR_COND_CLEANUP]]: -; CHECK-VF4UF1-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] ; CHECK-VF4UF1-NEXT: ret i32 [[MINMAX_0_LCSSA]] ; CHECK-VF4UF1: [[SCALAR_BODY]]: ; ; CHECK-VF4UF2-LABEL: define i32 @recurrence_2( ; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) { -; CHECK-VF4UF2-NEXT: [[ENTRY:.*]]: -; CHECK-VF4UF2-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[N]], 0 -; CHECK-VF4UF2-NEXT: br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK-VF4UF2-NEXT: [[ENTRY:.*:]] +; CHECK-VF4UF2-NEXT: br label %[[FOR_PREHEADER:.*]] ; CHECK-VF4UF2: [[FOR_PREHEADER]]: ; CHECK-VF4UF2-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], 
i64 -1 ; CHECK-VF4UF2-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4 -; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1 +; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1 ; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3 ; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] @@ -261,8 +262,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF2: [[VECTOR_BODY]]: ; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD2:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ] -; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4UF2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ] ; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] ; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 2 @@ -296,19 +297,17 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; CHECK-VF4UF2: [[SCALAR_PH]]: ; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ] ; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] -; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ] +; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] ; CHECK-VF4UF2-NEXT: br label %[[SCALAR_BODY:.*]] ; CHECK-VF4UF2: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-VF4UF2-NEXT: [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4UF2-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-VF4UF2-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4UF2-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; CHECK-VF4UF2: [[FOR_COND_CLEANUP]]: -; CHECK-VF4UF2-NEXT: [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ] ; CHECK-VF4UF2-NEXT: ret i32 [[MINMAX_0_LCSSA]] ; CHECK-VF4UF2: [[SCALAR_BODY]]: ; entry: - %cmp27 = icmp sgt i32 %n, 0 - br i1 %cmp27, label %for.preheader, label %for.cond.cleanup + br label %for.preheader for.preheader: %arrayidx2.phi.trans.insert = getelementptr inbounds i32, ptr %a, i64 -1 @@ -320,13 +319,12 @@ for.cond.cleanup.loopexit: br label %for.cond.cleanup for.cond.cleanup: - %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %minmax.0.lcssa + ret i32 %minmax.0.cond.lcssa scalar.body: %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ] %indvars.iv = phi 
i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ] - %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ] + %minmax.028 = phi i32 [ 0, %for.preheader ], [ %minmax.0.cond, %scalar.body ] %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv %1 = load i32, ptr %arrayidx, align 4 %sub3 = sub nsw i32 %1, %0 diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll index 58ad64d..8224d6b 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll @@ -16,7 +16,7 @@ entry: br label %for.cond for.cond: ; preds = %for.cond, %entry - %i.0 = phi i32 [ undef, %entry ], [ %inc, %for.cond ] + %i.0 = phi i32 [ poison, %entry ], [ %inc, %for.cond ] %cmp = icmp slt i32 %i.0, 0 %fsub = fsub double undef, undef %fadd = fadd double %fsub, 1.000000e+00 @@ -36,7 +36,7 @@ for.cond7.preheader.lr.ph: ; preds = %for.cond4.preheader for.cond7.preheader: ; preds = %for.cond7.preheader.lr.ph, %for.inc23 %y.017 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %inc24, %for.inc23 ] %i.116 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %i.2.lcssa, %for.inc23 ] - %n.015 = phi i32 [ undef, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ] + %n.015 = phi i32 [ poison, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ] %1 = load i32, ptr @b, align 4, !tbaa !5 %tobool11 = icmp eq i32 %1, 0 br i1 %tobool11, label %for.inc23, label %for.body8.lr.ph diff --git a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll index 58542f4..163faa2 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-predicate-reasoning.ll @@ -29,16 +29,16 @@ define void @step_direction_unknown(i32 %arg, ptr %dst) { ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i32> [[TMP8]] to <4 x i64> ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP16]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP11]], align 8 -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP14]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP15]], align 8 +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP16]], align 8 ; CHECK-NEXT: store double 
0.000000e+00, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll index f4d5a84..cd9eb72 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -23,7 +23,7 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) { ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i64 [[RDX_SELECT]] @@ -33,7 +33,7 @@ entry: loop: %iv = phi i32 [ 1, %entry ], [ %add, %loop ] - %red = phi i64 [ undef, %entry ], [ %select, %loop ] + %red = phi i64 [ poison, %entry ], [ %select, %loop ] %gep = getelementptr inbounds i32, ptr %src, i32 %iv %l = load i32, ptr %gep %c = icmp eq i32 %l, 1 diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index cc3bda4..5cdde05 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -387,7 +387,7 @@ exit: ret i64 %res } -; TODO: The existing assumptions should be strong enough to vectorize this. +; The existing assumptions are strong enough to vectorize this. 
define ptr @find_deref_pointer_distance_align_attribute_argument(ptr align 2 %first, ptr align 2 %last) nofree nosync { ; CHECK-LABEL: define ptr @find_deref_pointer_distance_align_attribute_argument( ; CHECK-SAME: ptr align 2 [[FIRST:%.*]], ptr align 2 [[LAST:%.*]]) #[[ATTR0]] { @@ -401,18 +401,55 @@ define ptr @find_deref_pointer_distance_align_attribute_argument(ptr align 2 %fi ; CHECK-NEXT: [[C_0:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] ; CHECK-NEXT: br i1 [[C_0]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] ; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST_I64]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP4]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[FIRST]], %[[LOOP_HEADER_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[FIRST]], %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[IV]], align 2 ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i16 [[L]], 1 -; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_LATCH]] ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = 
getelementptr inbounds nuw i8, ptr [[IV]], i64 2 ; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[IV_NEXT]], [[LAST]] -; CHECK-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]] +; CHECK-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I_PH:%.*]] = phi ptr [ [[IV_NEXT]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I_PH:%.*]] = phi ptr [ [[IV_NEXT]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[TMP14]], %[[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[FIRST_ADDR_0_LCSSA_I_PH]], %[[EXIT_LOOPEXIT]] ] @@ -444,7 +481,7 @@ exit: ret ptr %first.addr.0.lcssa.i } -; TODO: The existing assumptions should be strong enough to vectorize this. +; The existing assumptions are strong enough to vectorize this. define ptr @find_deref_pointer_distance_align_assumption(ptr %first, ptr %last) nofree nosync { ; CHECK-LABEL: define ptr @find_deref_pointer_distance_align_assumption( ; CHECK-SAME: ptr [[FIRST:%.*]], ptr [[LAST:%.*]]) #[[ATTR0]] { @@ -458,18 +495,55 @@ define ptr @find_deref_pointer_distance_align_assumption(ptr %first, ptr %last) ; CHECK-NEXT: [[C_0:%.*]] = icmp eq ptr [[FIRST]], [[LAST]] ; CHECK-NEXT: br i1 [[C_0]], label %[[EXIT:.*]], label %[[LOOP_HEADER_PREHEADER:.*]] ; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST_I64]], -2 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST_I64]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP4]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 1) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[MIDDLE_SPLIT]]: +; CHECK-NEXT: br i1 [[TMP8]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[VECTOR_EARLY_EXIT]]: +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true) +; CHECK-NEXT: [[TMP12:%.*]] = 
add i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[FIRST]], %[[LOOP_HEADER_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[FIRST]], %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[IV]], align 2 ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i16 [[L]], 1 -; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_LATCH]] +; CHECK-NEXT: br i1 [[C_1]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_LATCH]] ; CHECK: [[LOOP_LATCH]]: ; CHECK-NEXT: [[IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[IV]], i64 2 ; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[IV_NEXT]], [[LAST]] -; CHECK-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]] +; CHECK-NEXT: br i1 [[C_2]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I_PH:%.*]] = phi ptr [ [[IV_NEXT]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I_PH:%.*]] = phi ptr [ [[IV_NEXT]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[TMP14]], %[[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[FIRST_ADDR_0_LCSSA_I:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[FIRST_ADDR_0_LCSSA_I_PH]], %[[EXIT_LOOPEXIT]] ] @@ -522,7 +596,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 ; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_SPLIT]]: ; CHECK-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 3bb39b9..cde2de7 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -142,8 +142,8 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP3]], align 1 ; 
CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll index 99916a5..8123092 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll @@ -14,12 +14,12 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 ; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0 -; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]] ; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1 -; VF4-NEXT: [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]] ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2 -; VF4-NEXT: [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP3:%.*]] = tail call { i64 } @fn1(float [[TMP2]]) #[[ATTR0:[0-9]+]] +; VF4-NEXT: [[TMP5:%.*]] = tail call { i64 } @fn1(float [[TMP4]]) #[[ATTR0]] +; VF4-NEXT: [[TMP7:%.*]] = tail call { i64 } @fn1(float [[TMP6]]) #[[ATTR0]] ; VF4-NEXT: [[TMP9:%.*]] = tail call { i64 } @fn1(float [[TMP8]]) #[[ATTR0]] ; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i64 } [[TMP3]], 0 ; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i64> poison, i64 [[TMP10]], i64 0 @@ -55,11 +55,13 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] ; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2 ; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 +; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 ; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 -; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0:[0-9]+]] -; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 -; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]] +; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i64 } @fn1(float [[TMP14]]) #[[ATTR0:[0-9]+]] +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i64 } @fn1(float [[TMP3]]) #[[ATTR0]] ; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i64 } [[TMP4]], 0 ; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i64 0 ; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP8]], 0 @@ -67,10 +69,8 @@ define void @struct_return_1xi64_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP11:%.*]] = extractvalue { <2 x i64> } [[TMP9]], 0 ; VF2IC2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i64 1 ; VF2IC2-NEXT: [[TMP13:%.*]] = insertvalue { <2 x i64> } [[TMP9]], <2 x i64> [[TMP12]], 0 -; VF2IC2-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 -; VF2IC2-NEXT: [[TMP15:%.*]] = tail 
call { i64 } @fn1(float [[TMP14]]) #[[ATTR0]] -; VF2IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 -; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]] +; VF2IC2-NEXT: [[TMP15:%.*]] = tail call { i64 } @fn1(float [[TMP16]]) #[[ATTR0]] +; VF2IC2-NEXT: [[TMP17:%.*]] = tail call { i64 } @fn1(float [[TMP5]]) #[[ATTR0]] ; VF2IC2-NEXT: [[TMP18:%.*]] = extractvalue { i64 } [[TMP15]], 0 ; VF2IC2-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> poison, i64 [[TMP18]], i64 0 ; VF2IC2-NEXT: [[TMP20:%.*]] = insertvalue { <2 x i64> } poison, <2 x i64> [[TMP19]], 0 @@ -120,12 +120,12 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 ; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0 -; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]] ; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1 -; VF4-NEXT: [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]] ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2 -; VF4-NEXT: [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP3:%.*]] = tail call { float, float } @fn2(float [[TMP2]]) #[[ATTR1:[0-9]+]] +; VF4-NEXT: [[TMP5:%.*]] = tail call { float, float } @fn2(float [[TMP4]]) #[[ATTR1]] +; VF4-NEXT: [[TMP7:%.*]] = tail call { float, float } @fn2(float [[TMP6]]) #[[ATTR1]] ; VF4-NEXT: [[TMP9:%.*]] = tail call { float, float } @fn2(float [[TMP8]]) #[[ATTR1]] ; VF4-NEXT: [[TMP10:%.*]] = extractvalue { float, float } [[TMP3]], 0 ; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 @@ -180,11 +180,13 @@ define void @struct_return_2xf32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] ; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2 ; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 +; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 ; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP2]], align 4 -; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1:[0-9]+]] -; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 -; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]] +; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1:[0-9]+]] +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR1]] ; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { float, float } [[TMP4]], 0 ; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 ; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP8]], 0 @@ -200,10 +202,8 @@ define void @struct_return_2xf32_replicate(ptr 
noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP19:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP17]], 1 ; VF2IC2-NEXT: [[TMP20:%.*]] = insertelement <2 x float> [[TMP19]], float [[TMP18]], i64 1 ; VF2IC2-NEXT: [[TMP21:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP17]], <2 x float> [[TMP20]], 1 -; VF2IC2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 -; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP22]]) #[[ATTR1]] -; VF2IC2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 -; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]] +; VF2IC2-NEXT: [[TMP23:%.*]] = tail call { float, float } @fn2(float [[TMP24]]) #[[ATTR1]] +; VF2IC2-NEXT: [[TMP25:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR1]] ; VF2IC2-NEXT: [[TMP26:%.*]] = extractvalue { float, float } [[TMP23]], 0 ; VF2IC2-NEXT: [[TMP27:%.*]] = insertelement <2 x float> poison, float [[TMP26]], i64 0 ; VF2IC2-NEXT: [[TMP28:%.*]] = insertvalue { <2 x float>, <2 x float> } poison, <2 x float> [[TMP27]], 0 @@ -271,12 +271,12 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 0 -; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]] ; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 1 -; VF4-NEXT: [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]] ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2 -; VF4-NEXT: [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP3:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP2]]) #[[ATTR2:[0-9]+]] +; VF4-NEXT: [[TMP5:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP4]]) #[[ATTR2]] +; VF4-NEXT: [[TMP7:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP6]]) #[[ATTR2]] ; VF4-NEXT: [[TMP9:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP8]]) #[[ATTR2]] ; VF4-NEXT: [[TMP10:%.*]] = extractvalue { i32, i32, i32 } [[TMP3]], 0 ; VF4-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 @@ -350,11 +350,13 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]] ; VF2IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i32 2 ; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 ; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; VF2IC2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 -; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2:[0-9]+]] -; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 -; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]] +; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP4:%.*]] = tail call { i32, i32, i32 } 
@fn3(i32 [[TMP30]]) #[[ATTR2:[0-9]+]] +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP3]]) #[[ATTR2]] ; VF2IC2-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32, i32 } [[TMP4]], 0 ; VF2IC2-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i64 0 ; VF2IC2-NEXT: [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP8]], 0 @@ -378,10 +380,8 @@ define void @struct_return_3xi32_replicate(ptr noalias %in, ptr noalias writeonl ; VF2IC2-NEXT: [[TMP27:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], 2 ; VF2IC2-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[TMP26]], i64 1 ; VF2IC2-NEXT: [[TMP29:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP25]], <2 x i32> [[TMP28]], 2 -; VF2IC2-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 0 -; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP30]]) #[[ATTR2]] -; VF2IC2-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[WIDE_LOAD1]], i32 1 -; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]] +; VF2IC2-NEXT: [[TMP31:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP32]]) #[[ATTR2]] +; VF2IC2-NEXT: [[TMP33:%.*]] = tail call { i32, i32, i32 } @fn3(i32 [[TMP5]]) #[[ATTR2]] ; VF2IC2-NEXT: [[TMP34:%.*]] = extractvalue { i32, i32, i32 } [[TMP31]], 0 ; VF2IC2-NEXT: [[TMP35:%.*]] = insertelement <2 x i32> poison, i32 [[TMP34]], i64 0 ; VF2IC2-NEXT: [[TMP36:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP35]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll index 63ca454..abdd5e9 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-args-call-variants.ll @@ -76,10 +76,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[TMP0]]) #[[ATTR0]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = call double @foo(double [[TMP2]], i64 [[INDEX]]) #[[ATTR0]] +; CHECK-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP3]], i64 [[TMP0]]) #[[ATTR0]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i64 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP5]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP8]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index 985a9a2..71311db 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -102,14 +102,14 @@ define void @blend_chain_iv(i1 %c) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
%[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[PREDPHI1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> undef +; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[PREDPHI1]], <4 x i64> poison ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI2]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP5]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[TMP7]] ; CHECK-NEXT: store i16 0, ptr [[TMP2]], align 2 ; CHECK-NEXT: store i16 0, ptr [[TMP4]], align 2 @@ -139,11 +139,11 @@ loop.next.2: br label %loop.next.3 loop.next.3: - %blend.1 = phi i64 [ undef, %loop.next ], [ %iv, %loop.next.2 ] + %blend.1 = phi i64 [ poison, %loop.next ], [ %iv, %loop.next.2 ] br label %loop.latch loop.latch: ; preds = %loop.next, %loop.header - %blend = phi i64 [ undef, %loop.header ], [ %blend.1, %loop.next.3 ] + %blend = phi i64 [ poison, %loop.header ], [ %blend.1, %loop.next.3 ] %dst.ptr = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 %blend store i16 0, ptr %dst.ptr %iv.next = add nuw nsw i64 %iv, 1 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll index 571c55c..927fefc 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll @@ -101,10 +101,10 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -153,20 +153,20 @@ 
define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -210,12 +210,12 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8 +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -257,20 +257,20 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -313,20 +313,20 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 
[[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -368,20 +368,20 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -424,12 +424,12 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8 +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -509,10 +509,10 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -558,10 +558,10 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -610,20 +610,20 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> 
[[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -667,12 +667,12 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -714,20 +714,20 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> 
[[VEC_IND]], splat (i64 3) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -770,20 +770,20 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: 
[[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -826,20 +826,20 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -883,12 +883,12 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: 
store i64 [[TMP8]], ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] @@ -931,10 +931,10 @@ define void @test_step_is_not_invariant(ptr %A) { ; CHECK-NEXT: [[TMP5:%.*]] = udiv <2 x i16> [[TMP4]], splat (i16 6) ; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i16> [[TMP5]] to <2 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store i16 [[TMP2]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll index 6cf82fc..d6277d6 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll @@ -101,10 +101,10 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -153,20 +153,20 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; 
CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -255,20 +255,20 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -1) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr 
[[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -310,20 +310,20 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -363,10 +363,10 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; 
CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -415,20 +415,20 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 @@ -471,20 +471,20 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -2) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -527,20 +527,20 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[VEC_IND]], splat (i64 -3) ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; CHECK-NEXT: store i64 [[TMP14]], ptr 
[[TMP16]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll index 9357adf..edf04bbc 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll @@ -18,28 +18,28 @@ define void @ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP1:%.*]] = urem <8 x i64> [[TMP0]], splat (i64 3) ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP1]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP1]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP1]], i32 7 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; 
CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> poison, i64 [[TMP18]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 1 @@ -94,28 +94,28 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP1:%.*]] = udiv <8 x i64> [[TMP0]], splat (i64 2) ; CHECK-NEXT: [[TMP2:%.*]] = urem <8 x i64> [[TMP1]], splat (i64 3) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP14]], align 8 -; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i64> [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load i64, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP26:%.*]] = load i64, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> poison, i64 [[TMP19]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 1 @@ -168,28 +168,28 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = udiv <8 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i64> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i64> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i64> [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i64> [[TMP0]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i64> [[TMP0]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> poison, i64 [[TMP17]], i32 0 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 1 @@ -317,28 +317,28 @@ define void @ld_div2_ld_scevunknown_nonuniform(ptr %src.a, ptr noalias %src.b, p ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 7 ; CHECK-NEXT: [[TMP32:%.*]] = udiv <8 x i64> [[TMP31]], splat (i64 2) ; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i64> [[TMP32]], i32 0 -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1 -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2 -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4 -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP41]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP43]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i64> [[TMP32]], i32 6 -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP45]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP34]], align 4 -; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP36]], align 4 -; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP38]], align 4 -; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP40]], align 4 -; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP42]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP44]], align 4 -; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP46]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5 +; CHECK-NEXT: [[TMP39:%.*]] = 
extractelement <8 x i64> [[TMP32]], i32 6 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP34]] +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]] +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP40]] +; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP41]], align 4 +; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP42]], align 4 +; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP43]], align 4 +; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP44]], align 4 +; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP45]], align 4 +; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP46]], align 4 +; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP47]], align 4 ; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP48]], align 4 ; CHECK-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> poison, i32 [[TMP49]], i32 0 ; CHECK-NEXT: [[TMP58:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP50]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll index 2b5d0f3..32873a4 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll @@ -105,16 +105,16 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VF4-NEXT: [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8 -; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], 
align 8 ; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0 ; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1 @@ -233,20 +233,20 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -272,34 +272,34 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 
+; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 @@ -343,12 +343,12 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] ; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; VF2-NEXT: [[TMP4:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 8 -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x 
i64> [[TMP4]], i32 1 -; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8 +; VF2-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -374,18 +374,18 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; VF4-NEXT: [[TMP6:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: store i64 [[TMP11]], ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP7]], ptr [[TMP11]], align 8 +; VF4-NEXT: store i64 [[TMP8]], ptr [[TMP12]], align 8 +; VF4-NEXT: store i64 [[TMP9]], ptr [[TMP13]], align 8 +; VF4-NEXT: store i64 [[TMP10]], ptr [[TMP14]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 ; VF4-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -427,20 +427,20 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], zeroinitializer ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; 
VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -465,34 +465,34 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], zeroinitializer ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], 
i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -534,20 +534,20 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 ; VF2-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[TMP10]], splat (i64 42) -; VF2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 -; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP12]], align 8 -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 -; VF2-NEXT: 
store i64 [[TMP15]], ptr [[TMP13]], align 8 +; VF2-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP12]], ptr [[TMP14]], align 8 +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -572,34 +572,34 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 ; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP13]], i32 0 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 [[TMP14]], i32 1 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 2 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 3 ; VF4-NEXT: [[TMP21:%.*]] = add nsw <4 x i64> [[TMP20]], splat (i64 42) -; VF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 -; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP22]], align 8 -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: 
[[TMP29:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 +; VF4-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP21]], i32 0 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP21]], i32 1 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP21]], i32 2 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP21]], i32 3 +; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP22]], ptr [[TMP26]], align 8 +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -640,10 +640,10 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; VF2-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0 -; VF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP2]], align 8 +; VF2-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF2-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 ; VF2-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8 ; VF2-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 ; VF2-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[TMP6]], i32 1 @@ -670,16 +670,16 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] ; VF4-NEXT: [[TMP0:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] -; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP2]], align 8 -; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 +; VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF4-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF4-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> poison, i64 [[TMP9]], i32 0 ; VF4-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 1 @@ -731,12 +731,12 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] ; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; VF2-NEXT: [[TMP5:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 -; VF2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP6]], ptr [[TMP8]], align 8 +; VF2-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 ; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -762,18 +762,18 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] ; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8 ; VF4-NEXT: [[TMP7:%.*]] = add nsw <4 x i64> [[WIDE_LOAD]], splat (i64 42) -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: store i64 [[TMP12]], ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: store i64 [[TMP13]], ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; 
VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP8]], ptr [[TMP12]], align 8 +; VF4-NEXT: store i64 [[TMP9]], ptr [[TMP13]], align 8 +; VF4-NEXT: store i64 [[TMP10]], ptr [[TMP14]], align 8 +; VF4-NEXT: store i64 [[TMP11]], ptr [[TMP15]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 ; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -815,20 +815,20 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; VF2-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -854,34 +854,34 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; VF4-NEXT: [[TMP13:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3 ; VF4-NEXT: [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42) -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -924,20 +924,20 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 3 ; VF2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> 
[[TMP3]], i32 0 -; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1 ; VF2-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[TMP11]], splat (i64 42) -; VF2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 -; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP13]], align 8 -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 +; VF2-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP12]], i32 0 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP12]], i32 1 +; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP13]], ptr [[TMP15]], align 8 +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 @@ -963,34 +963,34 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 9 ; VF4-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[VEC_IND]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr 
[[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 ; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP14]], i32 0 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 1 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 2 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 3 ; VF4-NEXT: [[TMP22:%.*]] = add nsw <4 x i64> [[TMP21]], splat (i64 42) -; VF4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 -; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP23]], align 8 -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 +; VF4-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP23]], ptr [[TMP27]], align 8 +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll index 12851d7..607d136 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll @@ -18,10 +18,10 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: 
[[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -52,16 +52,16 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -143,16 +143,16 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, 
ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -208,10 +208,10 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -242,16 +242,16 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: 
[[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -310,20 +310,20 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -353,34 +353,34 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, 
ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -431,20 +431,20 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -474,34 +474,34 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 
[[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -552,20 +552,20 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; 
VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -595,34 +595,34 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = 
extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -673,20 +673,20 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -715,34 +715,34 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], 
[[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: 
store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -792,20 +792,20 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -834,34 +834,34 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = 
extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -911,20 +911,20 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP3:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] ; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; 
VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 +; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8 ; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i32 0 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP10]], i32 1 ; VF2-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[TMP12]], splat (i64 42) -; VF2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 -; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP14]], align 8 -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 +; VF2-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP13]], i32 0 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP13]], i32 1 +; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: store i64 [[TMP14]], ptr [[TMP16]], align 8 +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -953,34 +953,34 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP5:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP6]], i32 0 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 -; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 -; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP13]] -; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP8]], align 8 -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP10]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP6]], i32 1 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3 +; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 8 +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 ; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> poison, i64 [[TMP15]], i32 0 ; 
VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> [[TMP19]], i64 [[TMP16]], i32 1 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 2 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 3 ; VF4-NEXT: [[TMP23:%.*]] = add nsw <4 x i64> [[TMP22]], splat (i64 42) -; VF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 -; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP24]], align 8 -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 +; VF4-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP23]], i32 0 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP23]], i32 1 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP23]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP23]], i32 3 +; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]] +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: store i64 [[TMP24]], ptr [[TMP28]], align 8 +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1028,10 +1028,10 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1062,16 +1062,16 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = 
extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1127,10 +1127,10 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1161,16 +1161,16 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1226,10 +1226,10 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP1:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP2:%.*]] = add <2 x i64> [[TMP0]], [[TMP1]] ; VF2-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 -; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF2-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8 ; VF2-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 ; VF2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0 ; VF2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 1 @@ -1260,16 +1260,16 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP1:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP2:%.*]] = add <4 x i64> [[TMP0]], [[TMP1]] ; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 -; VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] -; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 -; VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] -; VF4-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 -; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] -; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 -; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] -; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP6]], align 8 -; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; VF4-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; VF4-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 
[[TMP3]] +; VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP5]] +; VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF4-NEXT: [[TMP12:%.*]] = load i64, ptr [[TMP8]], align 8 +; VF4-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 8 ; VF4-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> poison, i64 [[TMP11]], i32 0 ; VF4-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP12]], i32 1 @@ -1328,20 +1328,20 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1371,34 +1371,34 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; 
VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1449,20 +1449,20 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat 
(i64 2) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1492,34 +1492,34 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1570,20 +1570,20 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement 
<2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1613,34 +1613,34 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = 
getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1691,20 +1691,20 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 1) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr 
[[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1734,34 +1734,34 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 1) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = 
extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1812,20 +1812,20 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1855,34 +1855,34 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 2) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = 
extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] 
= add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) @@ -1933,20 +1933,20 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[TMP4:%.*]] = udiv <2 x i64> [[VEC_IND1]], splat (i64 3) ; VF2-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP3]], [[TMP4]] ; VF2-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; VF2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] -; VF2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; VF2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP6]] +; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] +; VF2-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 8 ; VF2-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 8 ; VF2-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 ; VF2-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP11]], i32 1 ; VF2-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[TMP13]], splat (i64 42) -; VF2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 -; VF2-NEXT: store i64 [[TMP17]], ptr [[TMP15]], align 8 -; VF2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 -; VF2-NEXT: store i64 [[TMP18]], ptr [[TMP16]], align 8 +; VF2-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP14]], i32 0 +; VF2-NEXT: [[TMP16:%.*]] = extractelement <2 x i64> [[TMP14]], i32 1 +; VF2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF2-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; VF2-NEXT: store i64 [[TMP16]], ptr [[TMP18]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) @@ -1976,34 +1976,34 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[VEC_IND1]], splat (i64 3) ; VF4-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]] ; VF4-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP7]], i32 0 -; VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] -; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 -; VF4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] -; VF4-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 -; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP12]] -; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 -; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP14]] -; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP9]], align 8 -; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP11]], align 8 -; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP7]], i32 1 +; VF4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[TMP7]], i32 2 +; VF4-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP7]], i32 3 +; 
VF4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; VF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; VF4-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] +; VF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; VF4-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP12]], align 8 +; VF4-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP13]], align 8 +; VF4-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP14]], align 8 ; VF4-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP15]], align 8 ; VF4-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[TMP16]], i32 0 ; VF4-NEXT: [[TMP21:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP17]], i32 1 ; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x i64> [[TMP21]], i64 [[TMP18]], i32 2 ; VF4-NEXT: [[TMP23:%.*]] = insertelement <4 x i64> [[TMP22]], i64 [[TMP19]], i32 3 ; VF4-NEXT: [[TMP24:%.*]] = add nsw <4 x i64> [[TMP23]], splat (i64 42) -; VF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] -; VF4-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] -; VF4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; VF4-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP25]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 -; VF4-NEXT: store i64 [[TMP30]], ptr [[TMP26]], align 8 -; VF4-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 -; VF4-NEXT: store i64 [[TMP31]], ptr [[TMP27]], align 8 -; VF4-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP28]], align 8 +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP24]], i32 0 +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP24]], i32 1 +; VF4-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP24]], i32 2 +; VF4-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP24]], i32 3 +; VF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]] +; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP3]] +; VF4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; VF4-NEXT: store i64 [[TMP25]], ptr [[TMP29]], align 8 +; VF4-NEXT: store i64 [[TMP26]], ptr [[TMP30]], align 8 +; VF4-NEXT: store i64 [[TMP27]], ptr [[TMP31]], align 8 +; VF4-NEXT: store i64 [[TMP28]], ptr [[TMP32]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 027dcaf..6a6ae31 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -330,12 +330,12 @@ define void @test_versioned_with_non_ex_use(i32 %offset, ptr noalias %dst.1, ptr ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP11:%.*]] = extractelement 
<4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST_1]], i32 [[TMP17]] ; CHECK-NEXT: store i32 0, ptr [[TMP12]], align 8 ; CHECK-NEXT: store i32 0, ptr [[TMP14]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll index 9ace6be..e5e0267 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-native-path-inner-loop-with-runtime-checks.ll @@ -61,29 +61,29 @@ define void @expand(ptr %src, ptr %dst, i64 %0) { ; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[SRC]], align 8, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: [[TMP20:%.*]] = shl <4 x i64> [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i64> [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i64> [[TMP20]], i32 2 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP25]] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP27]] -; CHECK-NEXT: store double [[TMP19]], ptr [[TMP22]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: store double [[TMP19]], ptr [[TMP24]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double [[TMP19]], ptr [[TMP31]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double [[TMP19]], ptr [[TMP26]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double [[TMP19]], ptr [[TMP33]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double [[TMP19]], ptr [[TMP28]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[TMP29:%.*]] = or disjoint <4 x i64> [[TMP20]], splat (i64 1) ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP29]], i32 0 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP29]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i64> [[TMP29]], i32 2 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]] ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP29]], i32 3 +; CHECK-NEXT: [[TMP41:%.*]] = 
getelementptr double, ptr [[DST]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP32]] +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP34]] ; CHECK-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP36]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP31]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP33]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP41]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP35]], align 8, !alias.scope [[META3]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP42]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[TMP37]], align 8, !alias.scope [[META3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll index 7aa7293..eaebfeb 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll @@ -22,9 +22,9 @@ define void @inner_loop_reduction(ptr noalias nocapture readonly %a.in, ptr noal ; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ] ; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ] ; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[A_PTR]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison) +; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %[[A_PTR]], <4 x i1> splat (i1 true), <4 x double> poison) ; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[B_PTR]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison) +; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 %[[B_PTR]], <4 x i1> splat (i1 true), <4 x double> poison) ; CHECK-NEXT: br label %[[FOR2_HEADER:.*]] ; CHECK: [[FOR2_HEADER]]: @@ -39,7 +39,7 @@ define void @inner_loop_reduction(ptr noalias nocapture readonly %a.in, ptr noal ; CHECK: [[FOR1_LATCH]]: ; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ] ; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> %[[C_PTR]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> align 8 %[[C_PTR]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4 ; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], splat (i64 4) ; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll index a4833f2..180fd84 100644 
--- a/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-call-instruction.ll @@ -16,9 +16,9 @@ define void @widen_call_instruction(ptr noalias nocapture readonly %a.in, ptr no ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_LATCH]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[A_IN]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true), <4 x double> poison) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[B_IN]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP1]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 [[TMP1]], <4 x i1> splat (i1 true), <4 x double> poison) ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[WIDE_MASKED_GATHER1]]) ; CHECK-NEXT: br label %[[FOR2_HEADER2:.*]] ; CHECK: [[FOR2_HEADER2]]: @@ -30,9 +30,9 @@ define void @widen_call_instruction(ptr noalias nocapture readonly %a.in, ptr no ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP6]], label %[[VECTOR_LATCH]], label %[[FOR2_HEADER2]] ; CHECK: [[VECTOR_LATCH]]: -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x double> [ [[TMP3]], %[[FOR2_HEADER2]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x double> [ [[TMP3]], %[[FOR2_HEADER2]] ] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[C_OUT]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[VEC_PHI4]], <4 x ptr> [[TMP7]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP9]], <4 x ptr> align 8 [[TMP7]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll index c782e09..48a11fa 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-widen-select-instruction.ll @@ -29,7 +29,7 @@ define void @loop_invariant_select(ptr noalias nocapture %out, i1 %select, doubl ; CHECK: [[FOR2_HEADER3]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP2:%.*]], %[[FOR2_HEADER3]] ] ; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SELECT]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP1]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP1]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI]], 
splat (i64 1) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP2]], splat (i64 10000) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 @@ -89,7 +89,7 @@ define void @outer_loop_dependant_select(ptr noalias nocapture %out, double %a, ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[FOR2_HEADER3]] ] ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP3]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP3]], splat (i64 10000) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 @@ -150,7 +150,7 @@ define void @inner_loop_dependant_select(ptr noalias nocapture %out, double %a, ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP3:%.*]], %[[FOR2_HEADER3]] ] ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1> ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP2]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP3]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP3]], splat (i64 10000) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0 @@ -212,7 +212,7 @@ define void @outer_and_inner_loop_dependant_select(ptr noalias nocapture %out, d ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i1> ; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP3]], <4 x ptr> [[TMP0]], i32 8, <4 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP3]], <4 x ptr> align 8 [[TMP0]], <4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 10000) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0 diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll index 80f2977..e6ebe97 100644 --- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll +++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll @@ -71,7 +71,7 @@ define void @callslotoptzn(<vscale x 4 x float> %val, ptr %out) { ; CHECK-NEXT: [[ALLOC:%.*]] = alloca <vscale x 4 x float>, align 16 ; CHECK-NEXT: [[IDX:%.*]] = tail call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() ; CHECK-NEXT: [[STRIDE:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], <vscale x 4 x i32> [[IDX]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> [[STRIDE]], i32 4, <vscale x 4 x i1> splat 
(i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x ptr> align 4 [[STRIDE]], <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[LI:%.*]] = load <vscale x 4 x float>, ptr [[ALLOC]], align 4 ; CHECK-NEXT: store <vscale x 4 x float> [[LI]], ptr [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail.ll b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail.ll index 47db666..2acbdae 100644 --- a/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail.ll +++ b/llvm/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather-xfail.ll @@ -7,12 +7,12 @@ declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x ; This test ensures that masked scatter and gather operations, which take vectors of pointers, ; do not have pointer aliasing ignored when being processed. ; No scatter/gather calls should end up eliminated -; CHECK: llvm.masked.gather -; CHECK: llvm.masked.gather -; CHECK: llvm.masked.scatter -; CHECK: llvm.masked.gather -; CHECK: llvm.masked.scatter -; CHECK: llvm.masked.gather +; CHECK: call{{.*}}llvm.masked.gather +; CHECK: call{{.*}}llvm.masked.gather +; CHECK: call{{.*}}llvm.masked.scatter +; CHECK: call{{.*}}llvm.masked.gather +; CHECK: call{{.*}}llvm.masked.scatter +; CHECK: call{{.*}}llvm.masked.gather define spir_kernel void @test(<2 x ptr> %in1, <2 x ptr> %in2, ptr %out) { entry: ; Just some temporary storage diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index 52d279a..e3765ed 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -114,14 +114,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP6]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <2 x i64> [[TMP8]], splat (i64 225) -; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <2 x i64> [[TMP10]], splat (i64 225) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP12]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]]) ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP17]], i64 16 @@ -190,14 +190,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <2 x i64> poison, i64 [[TMP37]], i64 0 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <2 x i64> [[TMP39]], i64 [[TMP38]], i64 1 ; CHECK-NEXT: [[TMP41:%.*]] = icmp ult <2 x i64> [[TMP36]], splat (i64 225) -; CHECK-NEXT: 
[[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225) ; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP41]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP43]]) ; CHECK-NEXT: [[TMP44:%.*]] = extractelement <2 x i1> [[TMP41]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP44]]) +; CHECK-NEXT: [[TMP42:%.*]] = icmp ult <2 x i64> [[TMP40]], splat (i64 225) ; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP45]]) ; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i1> [[TMP42]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP43]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP44]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP45]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP46]]) ; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP33]] ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP47]], i64 16 @@ -267,14 +267,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <2 x i64> poison, i64 [[TMP68]], i64 0 ; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i64> [[TMP70]], i64 [[TMP69]], i64 1 ; CHECK-NEXT: [[TMP72:%.*]] = icmp ult <2 x i64> [[TMP67]], splat (i64 225) -; CHECK-NEXT: [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225) ; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i1> [[TMP72]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP74]]) ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i1> [[TMP72]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP75]]) +; CHECK-NEXT: [[TMP73:%.*]] = icmp ult <2 x i64> [[TMP71]], splat (i64 225) ; CHECK-NEXT: [[TMP76:%.*]] = extractelement <2 x i1> [[TMP73]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP76]]) ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i1> [[TMP73]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP74]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP75]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP76]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP77]]) ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP64]] ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP78]], i64 16 @@ -344,14 +344,14 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i64> poison, i64 [[TMP99]], i64 0 ; CHECK-NEXT: [[TMP102:%.*]] = insertelement <2 x i64> [[TMP101]], i64 [[TMP100]], i64 1 ; CHECK-NEXT: [[TMP103:%.*]] = icmp ult <2 x i64> [[TMP98]], splat (i64 225) -; CHECK-NEXT: [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225) ; CHECK-NEXT: [[TMP105:%.*]] = extractelement <2 x i1> [[TMP103]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP105]]) ; CHECK-NEXT: [[TMP106:%.*]] = extractelement <2 x i1> [[TMP103]], i64 1 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP106]]) +; CHECK-NEXT: [[TMP104:%.*]] = icmp ult <2 x i64> [[TMP102]], splat (i64 225) ; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i1> [[TMP104]], i64 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP107]]) ; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i1> [[TMP104]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP105]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP106]]) +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP107]]) ; CHECK-NEXT: tail call void @llvm.assume(i1 
[[TMP108]]) ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds nuw double, ptr [[A]], i64 [[TMP95]] ; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP109]], i64 16 diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll index 0023dea..7fb72e6 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_add_q7.ll @@ -23,10 +23,10 @@ define dso_local void @arm_add_q7(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 ; CHECK-NEXT: [[NEXT_GEP14:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[PSRCB:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[BLOCKSIZE]]) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[NEXT_GEP]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[NEXT_GEP15]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[NEXT_GEP]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[NEXT_GEP15]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> [[WIDE_MASKED_LOAD16]]) -; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP2]], ptr [[NEXT_GEP14]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP2]], ptr align 1 [[NEXT_GEP14]], <16 x i1> [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP4]], label [[WHILE_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll index 5e9fe8c..c186207 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll @@ -37,7 +37,7 @@ define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef ; CHECK-NEXT: br i1 [[CMP2_NOT15]], label [[WHILE_END5:%.*]], label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 [[AND]]) -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[PSRC_ADDR_0_LCSSA]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr align 1 [[PSRC_ADDR_0_LCSSA]], <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll index 
a3af048..2c1d73e 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX @@ -12,6 +12,400 @@ ; That may require some coordination between VectorCombine, SLP, and other passes. ; The end goal is to get a single "vaddsubps" instruction for x86 with AVX. +define <2 x double> @test_addsub_v2f64(<2 x double> %A, <2 x double> %B) { +; CHECK-LABEL: @test_addsub_v2f64( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: ret <2 x double> [[TMP3]] +; + %1 = extractelement <2 x double> %A, i32 0 + %2 = extractelement <2 x double> %B, i32 0 + %sub = fsub double %1, %2 + %3 = extractelement <2 x double> %A, i32 1 + %4 = extractelement <2 x double> %B, i32 1 + %add = fadd double %3, %4 + %vecinsert1 = insertelement <2 x double> poison, double %sub, i32 0 + %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1 + ret <2 x double> %vecinsert2 +} + +define <4 x double> @test_addsub_v4f64(<4 x double> %A, <4 x double> %B) { +; CHECK-LABEL: @test_addsub_v4f64( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x double> [[TMP3]] +; + %1 = extractelement <4 x double> %A, i32 0 + %2 = extractelement <4 x double> %B, i32 0 + %sub = fsub double %1, %2 + %3 = extractelement <4 x double> %A, i32 2 + %4 = extractelement <4 x double> %B, i32 2 + %sub2 = fsub double %3, %4 + %5 = extractelement <4 x double> %A, i32 1 + %6 = extractelement <4 x double> %B, i32 1 + %add = fadd double %5, %6 + %7 = extractelement <4 x double> %A, i32 3 + %8 = extractelement <4 x double> %B, i32 3 + %add2 = fadd double %7, %8 + %vecinsert1 = insertelement <4 x double> poison, double %add, i32 1 + %vecinsert2 = insertelement <4 x 
double> %vecinsert1, double %add2, i32 3 + %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0 + %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2 + ret <4 x double> %vecinsert4 +} + +define <8 x double> @test_addsub_v8f64(<8 x double> %A, <8 x double> %B) { +; SSE2-LABEL: @test_addsub_v8f64( +; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; SSE2-NEXT: ret <8 x double> [[TMP5]] +; +; SSE4-LABEL: @test_addsub_v8f64( +; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; SSE4-NEXT: ret <8 x double> [[TMP3]] +; +; AVX-LABEL: @test_addsub_v8f64( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; AVX-NEXT: ret <8 x double> [[TMP3]] +; + %1 = extractelement <8 x double> %A, i32 0 + %2 = extractelement <8 x double> %B, i32 0 + %sub = fsub double %1, %2 + %3 = extractelement <8 x double> %A, i32 2 + %4 = extractelement <8 x double> %B, i32 2 + %sub2 = fsub double %3, %4 + %5 = extractelement <8 x double> %A, i32 1 + %6 = extractelement <8 x double> %B, i32 1 + %add = fadd double %5, %6 + %7 = extractelement <8 x double> %A, i32 3 + %8 = extractelement <8 x double> %B, i32 3 + %add2 = fadd double %7, %8 + %9 = extractelement <8 x double> %A, i32 4 + %10 = extractelement <8 x double> %B, i32 4 + %sub3 = fsub double %9, %10 + %11 = extractelement <8 x double> %A, i32 6 + %12 = extractelement <8 x double> %B, i32 6 + %sub4 = fsub double %11, %12 + %13 = extractelement <8 x double> %A, i32 5 + %14 = extractelement <8 x double> %B, i32 5 + %add3 = fadd double %13, %14 + %15 = extractelement <8 x double> %A, i32 7 + %16 = extractelement <8 x double> %B, i32 7 + %add4 = fadd double %15, %16 + %vecinsert1 = insertelement <8 x double> poison, double %add, i32 1 + %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3 + %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0 + %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2 + %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5 + %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7 + %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4 + %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6 + ret <8 x double> %vecinsert8 +} + +define <2 x float> @test_addsub_v2f32(<2 x float> %v0, <2 x float> %v1) { +; CHECK-LABEL: @test_addsub_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], 
<2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: ret <2 x float> [[TMP3]] +; + %v2 = extractelement <2 x float> %v0, i32 0 + %v3 = extractelement <2 x float> %v1, i32 0 + %v4 = extractelement <2 x float> %v0, i32 1 + %v5 = extractelement <2 x float> %v1, i32 1 + %sub = fsub float %v2, %v3 + %add = fadd float %v5, %v4 + %res0 = insertelement <2 x float> poison, float %sub, i32 0 + %res1 = insertelement <2 x float> %res0, float %add, i32 1 + ret <2 x float> %res1 +} + +define <4 x float> @test_addsub_v4f32(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 2 + %4 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <4 x float> %A, i32 1 + %6 = extractelement <4 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <4 x float> %A, i32 3 + %8 = extractelement <4 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %vecinsert1 = insertelement <4 x float> poison, float %add, i32 1 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2 + ret <4 x float> %vecinsert4 +} + +define <8 x float> @test_v8f32(<8 x float> %A, <8 x float> %B) { +; SSE2-LABEL: @test_v8f32( +; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; SSE2-NEXT: ret <8 x float> [[TMP5]] +; +; SSE4-LABEL: @test_v8f32( +; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; SSE4-NEXT: ret <8 x float> [[TMP3]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; AVX-NEXT: ret <8 x float> [[TMP3]] +; + %1 = extractelement <8 x float> %A, i32 0 + %2 = extractelement <8 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <8 x float> %A, i32 2 + %4 = extractelement <8 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <8 x float> %A, i32 1 + %6 = extractelement <8 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <8 x float> %A, i32 3 + %8 = extractelement <8 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %9 = extractelement <8 x float> %A, i32 4 + %10 
= extractelement <8 x float> %B, i32 4 + %sub3 = fsub float %9, %10 + %11 = extractelement <8 x float> %A, i32 6 + %12 = extractelement <8 x float> %B, i32 6 + %sub4 = fsub float %11, %12 + %13 = extractelement <8 x float> %A, i32 5 + %14 = extractelement <8 x float> %B, i32 5 + %add3 = fadd float %13, %14 + %15 = extractelement <8 x float> %A, i32 7 + %16 = extractelement <8 x float> %B, i32 7 + %add4 = fadd float %15, %16 + %vecinsert1 = insertelement <8 x float> poison, float %add, i32 1 + %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2 + %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5 + %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7 + %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4 + %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6 + ret <8 x float> %vecinsert8 +} + +define <16 x float> @test_addsub_v16f32(<16 x float> %A, <16 x float> %B) { +; SSE2-LABEL: @test_addsub_v16f32( +; SSE2-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> +; SSE2-NEXT: ret <16 x float> [[TMP5]] +; +; SSE4-LABEL: @test_addsub_v16f32( +; SSE4-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> +; SSE4-NEXT: ret <16 x float> [[TMP3]] +; +; AVX-LABEL: @test_addsub_v16f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> +; AVX-NEXT: ret <16 x float> [[TMP3]] +; + %1 = extractelement <16 x float> %A, i32 0 + %2 = extractelement <16 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <16 x float> %A, i32 2 + %4 = extractelement <16 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <16 x float> %A, i32 1 + %6 = extractelement <16 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <16 x float> %A, i32 3 + %8 = extractelement <16 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %9 = extractelement <16 x float> %A, i32 4 + %10 = extractelement <16 x float> %B, i32 4 + %sub3 = fsub float %9, %10 + %11 = extractelement <16 x float> %A, i32 6 + %12 = extractelement <16 x float> %B, i32 6 + %sub4 = fsub float %11, %12 + %13 = extractelement <16 x float> %A, i32 5 + %14 = extractelement <16 x float> %B, i32 5 + %add3 = fadd float %13, %14 
+ %15 = extractelement <16 x float> %A, i32 7 + %16 = extractelement <16 x float> %B, i32 7 + %add4 = fadd float %15, %16 + %17 = extractelement <16 x float> %A, i32 8 + %18 = extractelement <16 x float> %B, i32 8 + %sub5 = fsub float %17, %18 + %19 = extractelement <16 x float> %A, i32 10 + %20 = extractelement <16 x float> %B, i32 10 + %sub6 = fsub float %19, %20 + %21 = extractelement <16 x float> %A, i32 9 + %22 = extractelement <16 x float> %B, i32 9 + %add5 = fadd float %21, %22 + %23 = extractelement <16 x float> %A, i32 11 + %24 = extractelement <16 x float> %B, i32 11 + %add6 = fadd float %23, %24 + %25 = extractelement <16 x float> %A, i32 12 + %26 = extractelement <16 x float> %B, i32 12 + %sub7 = fsub float %25, %26 + %27 = extractelement <16 x float> %A, i32 14 + %28 = extractelement <16 x float> %B, i32 14 + %sub8 = fsub float %27, %28 + %29 = extractelement <16 x float> %A, i32 13 + %30 = extractelement <16 x float> %B, i32 13 + %add7 = fadd float %29, %30 + %31 = extractelement <16 x float> %A, i32 15 + %32 = extractelement <16 x float> %B, i32 15 + %add8 = fadd float %31, %32 + %vecinsert1 = insertelement <16 x float> poison, float %add, i32 1 + %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2 + %vecinsert5 = insertelement <16 x float> %vecinsert4, float %add3, i32 5 + %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7 + %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4 + %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6 + %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9 + %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11 + %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8 + %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10 + %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13 + %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15 + %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12 + %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14 + ret <16 x float> %vecinsert16 +} + +; Test that non-sequential / partial add-sub patterns are still folded. 
+ +define <4 x float> @test_addsub_v4f32_shuffle_1302(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_shuffle_1302( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 2 + %4 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <4 x float> %A, i32 1 + %6 = extractelement <4 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <4 x float> %A, i32 3 + %8 = extractelement <4 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %vecinsert1 = insertelement <4 x float> poison, float %add, i32 1 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2 + ret <4 x float> %vecinsert4 +} + +define <4 x float> @test_addsub_v4f32_partial_23(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_23( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 3> +; CHECK-NEXT: ret <4 x float> [[VECINSERT21]] +; + %1 = extractelement <4 x float> %A, i32 2 + %2 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 3 + %4 = extractelement <4 x float> %B, i32 3 + %add2 = fadd float %3, %4 + %vecinsert1 = insertelement <4 x float> poison, float %sub2, i32 2 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3 + ret <4 x float> %vecinsert2 +} + +define <4 x float> @test_addsub_v4f32_partial_03(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_03( +; CHECK-NEXT: [[FOLDEXTEXTBINOP:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[FOLDEXTEXTBINOP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[VECINSERT2:%.*]] = shufflevector <4 x float> [[FOLDEXTEXTBINOP]], <4 x float> [[FOLDEXTEXTBINOP2]], <4 x i32> <i32 0, i32 poison, i32 poison, i32 7> +; CHECK-NEXT: ret <4 x float> [[VECINSERT2]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 3 + %4 = extractelement <4 x float> %B, i32 3 + %add = fadd float %4, %3 + %vecinsert1 = insertelement <4 x float> poison, float %sub, i32 0 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3 + ret <4 x float> %vecinsert2 +} + +define <4 x float> @test_addsub_v4f32_partial_12(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_12( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> 
<i32 1, i32 2> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 poison, i32 0, i32 3, i32 poison> +; CHECK-NEXT: ret <4 x float> [[VECINSERT21]] +; + %1 = extractelement <4 x float> %A, i32 2 + %2 = extractelement <4 x float> %B, i32 2 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 1 + %4 = extractelement <4 x float> %B, i32 1 + %add = fadd float %3, %4 + %vecinsert1 = insertelement <4 x float> poison, float %sub, i32 2 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1 + ret <4 x float> %vecinsert2 +} + +define <4 x float> @test_addsub_v4f32_partial_01(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_01( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 0, i32 3, i32 poison, i32 poison> +; CHECK-NEXT: ret <4 x float> [[TMP6]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub2 = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 1 + %4 = extractelement <4 x float> %B, i32 1 + %add2 = fadd float %3, %4 + %vecinsert1 = insertelement <4 x float> poison, float %sub2, i32 0 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1 + ret <4 x float> %vecinsert2 +} + define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) { ; CHECK-LABEL: @PR45015( ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll index 40dc2aa..fa6403f 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX -; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE -; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 +; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 ; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX ; RUN: opt < %s 
-passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX @@ -12,6 +12,404 @@ ; That may require some coordination between VectorCombine, SLP, and other passes. ; The end goal is to get a single "vaddsubps" instruction for x86 with AVX. +define <2 x double> @test_addsub_v2f64(<2 x double> %A, <2 x double> %B) { +; CHECK-LABEL: @test_addsub_v2f64( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: ret <2 x double> [[TMP3]] +; + %1 = extractelement <2 x double> %A, i32 0 + %2 = extractelement <2 x double> %B, i32 0 + %sub = fsub double %1, %2 + %3 = extractelement <2 x double> %A, i32 1 + %4 = extractelement <2 x double> %B, i32 1 + %add = fadd double %3, %4 + %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0 + %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1 + ret <2 x double> %vecinsert2 +} + +define <4 x double> @test_addsub_v4f64(<4 x double> %A, <4 x double> %B) { +; CHECK-LABEL: @test_addsub_v4f64( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x double> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x double> [[TMP3]] +; + %1 = extractelement <4 x double> %A, i32 0 + %2 = extractelement <4 x double> %B, i32 0 + %sub = fsub double %1, %2 + %3 = extractelement <4 x double> %A, i32 2 + %4 = extractelement <4 x double> %B, i32 2 + %sub2 = fsub double %3, %4 + %5 = extractelement <4 x double> %A, i32 1 + %6 = extractelement <4 x double> %B, i32 1 + %add = fadd double %5, %6 + %7 = extractelement <4 x double> %A, i32 3 + %8 = extractelement <4 x double> %B, i32 3 + %add2 = fadd double %7, %8 + %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1 + %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3 + %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0 + %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2 + ret <4 x double> %vecinsert4 +} + +define <8 x double> @test_addsub_v8f64(<8 x double> %A, <8 x double> %B) { +; SSE2-LABEL: @test_addsub_v8f64( +; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; SSE2-NEXT: ret <8 x double> [[TMP5]] +; +; SSE4-LABEL: @test_addsub_v8f64( +; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; SSE4-NEXT: ret <8 x double> [[TMP3]] +; +; AVX-LABEL: @test_addsub_v8f64( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX-NEXT: 
[[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; AVX-NEXT: ret <8 x double> [[TMP3]] +; + %1 = extractelement <8 x double> %A, i32 0 + %2 = extractelement <8 x double> %B, i32 0 + %sub = fsub double %1, %2 + %3 = extractelement <8 x double> %A, i32 2 + %4 = extractelement <8 x double> %B, i32 2 + %sub2 = fsub double %3, %4 + %5 = extractelement <8 x double> %A, i32 1 + %6 = extractelement <8 x double> %B, i32 1 + %add = fadd double %5, %6 + %7 = extractelement <8 x double> %A, i32 3 + %8 = extractelement <8 x double> %B, i32 3 + %add2 = fadd double %7, %8 + %9 = extractelement <8 x double> %A, i32 4 + %10 = extractelement <8 x double> %B, i32 4 + %sub3 = fsub double %9, %10 + %11 = extractelement <8 x double> %A, i32 6 + %12 = extractelement <8 x double> %B, i32 6 + %sub4 = fsub double %11, %12 + %13 = extractelement <8 x double> %A, i32 5 + %14 = extractelement <8 x double> %B, i32 5 + %add3 = fadd double %13, %14 + %15 = extractelement <8 x double> %A, i32 7 + %16 = extractelement <8 x double> %B, i32 7 + %add4 = fadd double %15, %16 + %vecinsert1 = insertelement <8 x double> undef, double %add, i32 1 + %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add2, i32 3 + %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub, i32 0 + %vecinsert4 = insertelement <8 x double> %vecinsert3, double %sub2, i32 2 + %vecinsert5 = insertelement <8 x double> %vecinsert4, double %add3, i32 5 + %vecinsert6 = insertelement <8 x double> %vecinsert5, double %add4, i32 7 + %vecinsert7 = insertelement <8 x double> %vecinsert6, double %sub3, i32 4 + %vecinsert8 = insertelement <8 x double> %vecinsert7, double %sub4, i32 6 + ret <8 x double> %vecinsert8 +} + +define <2 x float> @test_addsub_v2f32(<2 x float> %v0, <2 x float> %v1) { +; CHECK-LABEL: @test_addsub_v2f32( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> [[V0:%.*]], [[V1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: ret <2 x float> [[TMP3]] +; + %v2 = extractelement <2 x float> %v0, i32 0 + %v3 = extractelement <2 x float> %v1, i32 0 + %v4 = extractelement <2 x float> %v0, i32 1 + %v5 = extractelement <2 x float> %v1, i32 1 + %sub = fsub float %v2, %v3 + %add = fadd float %v5, %v4 + %res0 = insertelement <2 x float> undef, float %sub, i32 0 + %res1 = insertelement <2 x float> %res0, float %add, i32 1 + ret <2 x float> %res1 +} + +define <4 x float> @test_addsub_v4f32(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 2 + %4 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <4 x float> %A, i32 1 + %6 = extractelement <4 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <4 x float> %A, i32 3 + %8 = extractelement <4 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1 + %vecinsert2 = insertelement <4 x 
float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2 + ret <4 x float> %vecinsert4 +} + +define <8 x float> @test_v8f32(<8 x float> %A, <8 x float> %B) { +; SSE2-LABEL: @test_v8f32( +; SSE2-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> +; SSE2-NEXT: ret <8 x float> [[TMP5]] +; +; SSE4-LABEL: @test_v8f32( +; SSE4-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; SSE4-NEXT: ret <8 x float> [[TMP3]] +; +; AVX-LABEL: @test_v8f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> +; AVX-NEXT: ret <8 x float> [[TMP3]] +; + %1 = extractelement <8 x float> %A, i32 0 + %2 = extractelement <8 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <8 x float> %A, i32 2 + %4 = extractelement <8 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <8 x float> %A, i32 1 + %6 = extractelement <8 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <8 x float> %A, i32 3 + %8 = extractelement <8 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %9 = extractelement <8 x float> %A, i32 4 + %10 = extractelement <8 x float> %B, i32 4 + %sub3 = fsub float %9, %10 + %11 = extractelement <8 x float> %A, i32 6 + %12 = extractelement <8 x float> %B, i32 6 + %sub4 = fsub float %11, %12 + %13 = extractelement <8 x float> %A, i32 5 + %14 = extractelement <8 x float> %B, i32 5 + %add3 = fadd float %13, %14 + %15 = extractelement <8 x float> %A, i32 7 + %16 = extractelement <8 x float> %B, i32 7 + %add4 = fadd float %15, %16 + %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1 + %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2 + %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5 + %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7 + %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4 + %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6 + ret <8 x float> %vecinsert8 +} + +define <16 x float> @test_addsub_v16f32(<16 x float> %A, <16 x float> %B) { +; SSE2-LABEL: @test_addsub_v16f32( +; SSE2-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]] +; 
SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP4]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> +; SSE2-NEXT: ret <16 x float> [[TMP5]] +; +; SSE4-LABEL: @test_addsub_v16f32( +; SSE4-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> +; SSE4-NEXT: ret <16 x float> [[TMP3]] +; +; AVX-LABEL: @test_addsub_v16f32( +; AVX-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; AVX-NEXT: [[TMP2:%.*]] = fadd <16 x float> [[A]], [[B]] +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> +; AVX-NEXT: ret <16 x float> [[TMP3]] +; + %1 = extractelement <16 x float> %A, i32 0 + %2 = extractelement <16 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <16 x float> %A, i32 2 + %4 = extractelement <16 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <16 x float> %A, i32 1 + %6 = extractelement <16 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <16 x float> %A, i32 3 + %8 = extractelement <16 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %9 = extractelement <16 x float> %A, i32 4 + %10 = extractelement <16 x float> %B, i32 4 + %sub3 = fsub float %9, %10 + %11 = extractelement <16 x float> %A, i32 6 + %12 = extractelement <16 x float> %B, i32 6 + %sub4 = fsub float %11, %12 + %13 = extractelement <16 x float> %A, i32 5 + %14 = extractelement <16 x float> %B, i32 5 + %add3 = fadd float %13, %14 + %15 = extractelement <16 x float> %A, i32 7 + %16 = extractelement <16 x float> %B, i32 7 + %add4 = fadd float %15, %16 + %17 = extractelement <16 x float> %A, i32 8 + %18 = extractelement <16 x float> %B, i32 8 + %sub5 = fsub float %17, %18 + %19 = extractelement <16 x float> %A, i32 10 + %20 = extractelement <16 x float> %B, i32 10 + %sub6 = fsub float %19, %20 + %21 = extractelement <16 x float> %A, i32 9 + %22 = extractelement <16 x float> %B, i32 9 + %add5 = fadd float %21, %22 + %23 = extractelement <16 x float> %A, i32 11 + %24 = extractelement <16 x float> %B, i32 11 + %add6 = fadd float %23, %24 + %25 = extractelement <16 x float> %A, i32 12 + %26 = extractelement <16 x float> %B, i32 12 + %sub7 = fsub float %25, %26 + %27 = extractelement <16 x float> %A, i32 14 + %28 = extractelement <16 x float> %B, i32 14 + %sub8 = fsub float %27, %28 + %29 = extractelement <16 x float> %A, i32 13 + %30 = extractelement <16 x float> %B, i32 13 + %add7 = fadd float %29, %30 + %31 = extractelement <16 x float> %A, i32 15 + %32 = extractelement <16 x float> %B, i32 15 + %add8 = fadd float %31, %32 + %vecinsert1 = insertelement <16 x float> undef, float %add, i32 1 + %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <16 x float> %vecinsert3, float %sub2, i32 2 + %vecinsert5 = insertelement <16 x float> 
%vecinsert4, float %add3, i32 5 + %vecinsert6 = insertelement <16 x float> %vecinsert5, float %add4, i32 7 + %vecinsert7 = insertelement <16 x float> %vecinsert6, float %sub3, i32 4 + %vecinsert8 = insertelement <16 x float> %vecinsert7, float %sub4, i32 6 + %vecinsert9 = insertelement <16 x float> %vecinsert8, float %add5, i32 9 + %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add6, i32 11 + %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub5, i32 8 + %vecinsert12 = insertelement <16 x float> %vecinsert11, float %sub6, i32 10 + %vecinsert13 = insertelement <16 x float> %vecinsert12, float %add7, i32 13 + %vecinsert14 = insertelement <16 x float> %vecinsert13, float %add8, i32 15 + %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub7, i32 12 + %vecinsert16 = insertelement <16 x float> %vecinsert15, float %sub8, i32 14 + ret <16 x float> %vecinsert16 +} + +; Test that non-sequential / partial add-sub patterns are still folded. + +define <4 x float> @test_addsub_v4f32_shuffle_1302(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_shuffle_1302( +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> [[TMP3]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 2 + %4 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %3, %4 + %5 = extractelement <4 x float> %A, i32 1 + %6 = extractelement <4 x float> %B, i32 1 + %add = fadd float %5, %6 + %7 = extractelement <4 x float> %A, i32 3 + %8 = extractelement <4 x float> %B, i32 3 + %add2 = fadd float %7, %8 + %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3 + %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0 + %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2 + ret <4 x float> %vecinsert4 +} + +define <4 x float> @test_addsub_v4f32_partial_23(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_23( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 0, i32 3, i32 poison, i32 poison> +; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> <float undef, float undef, float poison, float poison>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> +; CHECK-NEXT: ret <4 x float> [[VECINSERT21]] +; + %1 = extractelement <4 x float> %A, i32 2 + %2 = extractelement <4 x float> %B, i32 2 + %sub2 = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 3 + %4 = extractelement <4 x float> %B, i32 3 + %add2 = fadd float %3, %4 + %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3 + ret <4 x float> %vecinsert2 +} + +define <4 x float> @test_addsub_v4f32_partial_03(<4 x float> %A, <4 x float> %B) { 
+; CHECK-LABEL: @test_addsub_v4f32_partial_03( +; CHECK-NEXT: [[FOLDEXTEXTBINOP:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[FOLDEXTEXTBINOP2:%.*]] = fadd <4 x float> [[A]], [[B]] +; CHECK-NEXT: [[VECINSERT1:%.*]] = shufflevector <4 x float> [[FOLDEXTEXTBINOP]], <4 x float> <float poison, float undef, float undef, float poison>, <4 x i32> <i32 0, i32 5, i32 6, i32 poison> +; CHECK-NEXT: [[VECINSERT2:%.*]] = shufflevector <4 x float> [[VECINSERT1]], <4 x float> [[FOLDEXTEXTBINOP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7> +; CHECK-NEXT: ret <4 x float> [[VECINSERT2]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 3 + %4 = extractelement <4 x float> %B, i32 3 + %add = fadd float %4, %3 + %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3 + ret <4 x float> %vecinsert2 +} + +define <4 x float> @test_addsub_v4f32_partial_12(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_12( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 1, i32 2> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32> <i32 0, i32 3, i32 poison, i32 poison> +; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> <float undef, float poison, float poison, float undef>, <4 x i32> <i32 4, i32 0, i32 1, i32 7> +; CHECK-NEXT: ret <4 x float> [[VECINSERT21]] +; + %1 = extractelement <4 x float> %A, i32 2 + %2 = extractelement <4 x float> %B, i32 2 + %sub = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 1 + %4 = extractelement <4 x float> %B, i32 1 + %add = fadd float %3, %4 + %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1 + ret <4 x float> %vecinsert2 +} + +define <4 x float> @test_addsub_v4f32_partial_01(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: @test_addsub_v4f32_partial_01( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32> <i32 0, i32 1> +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: ret <4 x float> [[TMP6]] +; + %1 = extractelement <4 x float> %A, i32 0 + %2 = extractelement <4 x float> %B, i32 0 + %sub2 = fsub float %1, %2 + %3 = extractelement <4 x float> %A, i32 1 + %4 = extractelement <4 x float> %B, i32 1 + %add2 = fadd float %3, %4 + %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0 + %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1 + ret <4 x float> %vecinsert2 +} + define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) { ; CHECK-LABEL: @PR45015( ; CHECK-NEXT: [[TMP1:%.*]] = 
fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll index 3ee9cf8..ffdff21 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll @@ -124,18 +124,14 @@ define void @test_known_trip_count() { ; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[WIDE_LOAD3_13]], [[WIDE_LOAD5_13]] ; CHECK-NEXT: store <2 x double> [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 416), align 16 ; CHECK-NEXT: store <2 x double> [[TMP27]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 432), align 16 -; CHECK-NEXT: [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @b, i64 448), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @b, i64 464), align 16 -; CHECK-NEXT: [[WIDE_LOAD4_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 448), align 16 -; CHECK-NEXT: [[WIDE_LOAD5_14:%.*]] = load <2 x double>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 464), align 16 -; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[WIDE_LOAD_14]], [[WIDE_LOAD4_14]] -; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x double> [[WIDE_LOAD3_14]], [[WIDE_LOAD5_14]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 448), align 16 -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 464), align 16 -; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 480), align 16 -; CHECK-NEXT: [[TMP31:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 480), align 16 +; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 448), align 16 +; CHECK-NEXT: [[TMP31:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 448), align 16 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[TMP31]] -; CHECK-NEXT: store double [[ADD]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 480), align 16 +; CHECK-NEXT: store double [[ADD]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 448), align 16 +; CHECK-NEXT: [[TMP32:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @b, i64 456), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @c, i64 456), align 8 +; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[TMP32]], [[TMP33]] +; CHECK-NEXT: store double [[ADD_1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 456), align 8 ; CHECK-NEXT: ret void ; entry: @@ -143,7 +139,7 @@ entry: for.cond: %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %cmp = icmp slt i32 %i.0, 61 + %cmp = icmp slt i32 %i.0, 58 br i1 %cmp, label %for.body, label %exit for.body: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll index c649f29e..dca864e 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops-with-cf.ll @@ -6,18 +6,18 @@ define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) { ; CHECK-LABEL: @basic( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call 
<1 x i16> @llvm.masked.load.v1i16.p0(ptr align 2 [[P:%.*]], <1 x i1> [[TMP0]], <1 x i16> poison) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 -; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[Q:%.*]], <1 x i1> [[TMP0]], <1 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 -; CHECK-NEXT: [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 8 [[B:%.*]], <1 x i1> [[TMP0]], <1 x i64> poison) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16> -; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 8, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr align 8 [[B]], <1 x i1> [[TMP0]]) ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> -; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr align 4 [[P]], <1 x i1> [[TMP0]]) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64> -; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]]) +; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr align 8 [[Q]], <1 x i1> [[TMP0]]) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll index eec8910..13fec64 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/masked-memory-ops.ll @@ -15,7 +15,7 @@ declare void @llvm.masked.store.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) define void @PR11210_v8f32_maskstore_maskstore(ptr %ptr, <8 x float> %x, <8 x float> %y, <8 x i32> %src) { ; CHECK-LABEL: @PR11210_v8f32_maskstore_maskstore( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <8 x i32> [[SRC:%.*]], zeroinitializer -; CHECK-NEXT: tail call void @llvm.masked.store.v8f32.p0(<8 x float> [[Y:%.*]], ptr [[PTR:%.*]], i32 1, <8 x i1> [[CMP]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v8f32.p0(<8 x float> [[Y:%.*]], ptr align 1 [[PTR:%.*]], <8 x i1> [[CMP]]) ; CHECK-NEXT: ret void ; %cmp = icmp sgt <8 x i32> %src, zeroinitializer @@ -41,7 +41,7 @@ define void @PR11210_v8f32_maskstore_maskstore_raw_mask(ptr %ptr, <8 x float> %x define void @PR11210_v8f32_mstore_maskstore(ptr %ptr, <8 x float> %x, <8 x float> %y, <8 x i32> %src) { ; CHECK-LABEL: @PR11210_v8f32_mstore_maskstore( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <8 x i32> [[SRC:%.*]], zeroinitializer -; CHECK-NEXT: tail call void @llvm.masked.store.v8f32.p0(<8 x float> [[Y:%.*]], ptr [[PTR:%.*]], i32 1, <8 x i1> [[CMP]]) +; CHECK-NEXT: tail call void @llvm.masked.store.v8f32.p0(<8 x float> [[Y:%.*]], ptr align 1 [[PTR:%.*]], <8 x i1> [[CMP]]) ; CHECK-NEXT: ret void ; %cmp = icmp sgt <8 x i32> %src, zeroinitializer diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index 9289128..dcfebe3 100644 --- 
a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -72,10 +72,10 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP9]], [[TMP13]] ; AVX2-NEXT: [[TMP18:%.*]] = or <8 x i1> [[TMP10]], [[TMP14]] ; AVX2-NEXT: [[TMP19:%.*]] = or <8 x i1> [[TMP11]], [[TMP15]] -; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[NEXT_GEP]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[TMP5]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[TMP6]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[TMP7]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[NEXT_GEP]], <8 x i1> [[TMP16]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[TMP5]], <8 x i1> [[TMP17]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[TMP6]], <8 x i1> [[TMP18]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[TMP7]], <8 x i1> [[TMP19]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -86,7 +86,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[TMP26:%.*]] = shl i64 [[N_VEC]], 2 ; AVX2-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP26]] ; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]] +; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]], !prof [[PROF3:![0-9]+]] ; AVX2: vec.epilog.ph: ; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX2-NEXT: [[N_VEC10:%.*]] = and i64 [[TMP3]], 9223372036854775800 @@ -101,10 +101,10 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD15]], splat (i32 -12) ; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD15]], splat (i32 13) ; AVX2-NEXT: [[TMP24:%.*]] = or <8 x i1> [[TMP22]], [[TMP23]] -; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr [[NEXT_GEP14]], i32 4, <8 x i1> [[TMP24]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> splat (i32 42), ptr align 4 [[NEXT_GEP14]], <8 x i1> [[TMP24]]) ; AVX2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX12]], 8 ; AVX2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC10]] -; AVX2-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[BB12]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: vec.epilog.middle.block: ; AVX2-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]] ; AVX2-NEXT: br i1 [[CMP_N17]], label [[EXIT]], label [[BB12_PREHEADER1]] @@ -124,7 +124,7 @@ define dso_local void @test(ptr %start, ptr 
%end) #0 { ; AVX2: latch: ; AVX2-NEXT: [[PTR_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR2]], i64 4 ; AVX2-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]] -; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP4:![0-9]+]] +; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB13]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX2: exit: ; AVX2-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll index ac73651..d980369 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll @@ -30,11 +30,11 @@ define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %f ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[Y]], <4 x i64> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[WIDE_LOAD12]] to <4 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[X]], <4 x i64> [[TMP5]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[DOUBLE_TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> [[TMP6]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 [[TMP4]], <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[DOUBLE_TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = tail call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> align 8 [[TMP6]], <4 x i1> splat (i1 true), <4 x double> poison), !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast olt <4 x double> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER13]] ; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x double> [[WIDE_MASKED_GATHER13]], <4 x double> [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: tail call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP8]], <4 x ptr> [[TMP4]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: tail call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> [[TMP8]], <4 x ptr> align 8 [[TMP4]], <4 x i1> splat (i1 true)), !tbaa [[DOUBLE_TBAA5]], !llvm.access.group [[ACC_GRP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV_EPIL]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[UNROLL_ITER]] ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-gather-scatter.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-gather-scatter.ll index 23096ec..93138f5 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-gather-scatter.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-gather-scatter.ll @@ -8,7 +8,7 @@ define <4 x i32> @vpgather_v4i32(<4 x ptr> %ptrs, <4 x i1> %m, i32 zeroext %evl) ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]] ; CHECK-NEXT: 
[[TMP2:%.*]] = and <4 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS:%.*]], i32 4, <4 x i1> [[TMP2]], <4 x i32> poison) +; CHECK-NEXT: [[V1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[PTRS:%.*]], <4 x i1> [[TMP2]], <4 x i32> poison) ; CHECK-NEXT: ret <4 x i32> [[V1]] ; %v = call <4 x i32> @llvm.vp.gather.v4i32.v4p0(<4 x ptr> %ptrs, <4 x i1> %m, i32 %evl) @@ -21,7 +21,7 @@ define <2 x i64> @vpgather_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, i32 zeroext %evl) ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[V1:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[PTRS:%.*]], i32 8, <2 x i1> [[TMP2]], <2 x i64> poison) +; CHECK-NEXT: [[V1:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 8 [[PTRS:%.*]], <2 x i1> [[TMP2]], <2 x i64> poison) ; CHECK-NEXT: ret <2 x i64> [[V1]] ; %v = call <2 x i64> @llvm.vp.gather.v2i64.v2p0(<2 x ptr> %ptrs, <2 x i1> %m, i32 %evl) @@ -34,7 +34,7 @@ define void @vpscatter_v4i32(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m, i32 z ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> [[PTRS:%.*]], i32 4, <4 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL:%.*]], <4 x ptr> align 4 [[PTRS:%.*]], <4 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %m, i32 %evl) @@ -47,7 +47,7 @@ define void @vpscatter_v2i64(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m, i32 z ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VAL:%.*]], <2 x ptr> [[PTRS:%.*]], i32 8, <2 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[VAL:%.*]], <2 x ptr> align 8 [[PTRS:%.*]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.scatter.v2i64.v2p0(<2 x i64> %val, <2 x ptr> %ptrs, <2 x i1> %m, i32 %evl) @@ -61,7 +61,7 @@ define <vscale x 2 x i32> @vpgather_nxv2i32(<vscale x 2 x ptr> %ptrs, <vscale x ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M:%.*]] ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 2 -; CHECK-NEXT: [[V1:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> [[PTRS:%.*]], i32 4, <vscale x 2 x i1> [[TMP2]], <vscale x 2 x i32> poison) +; CHECK-NEXT: [[V1:%.*]] = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[PTRS:%.*]], <vscale x 2 x i1> [[TMP2]], <vscale x 2 x i32> poison) ; CHECK-NEXT: ret <vscale x 2 x i32> [[V1]] ; %v = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m, i32 %evl) @@ -74,7 +74,7 
@@ define <vscale x 1 x i64> @vpgather_nxv1i64(<vscale x 1 x ptr> %ptrs, <vscale x ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[V1:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[PTRS:%.*]], i32 8, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) +; CHECK-NEXT: [[V1:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> align 8 [[PTRS:%.*]], <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[V1]] ; %v = call <vscale x 1 x i64> @llvm.vp.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %ptrs, <vscale x 1 x i1> %m, i32 %evl) @@ -87,7 +87,7 @@ define void @vpscatter_nxv2i32(<vscale x 2 x i32> %val, <vscale x 2 x ptr> %ptrs ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 2 x i1> [[TMP1]], [[M:%.*]] ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 2 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[VAL:%.*]], <vscale x 2 x ptr> [[PTRS:%.*]], i32 4, <vscale x 2 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> [[VAL:%.*]], <vscale x 2 x ptr> align 4 [[PTRS:%.*]], <vscale x 2 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> %val, <vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m, i32 %evl) @@ -100,7 +100,7 @@ define void @vpscatter_nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x ptr> %ptrs ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x ptr> [[PTRS:%.*]], i32 8, <vscale x 1 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x ptr> align 8 [[PTRS:%.*]], <vscale x 1 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64> %val, <vscale x 1 x ptr> %ptrs, <vscale x 1 x i1> %m, i32 %evl) diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll index 8192364..bd2726e 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/expand-vp-load-store.ll @@ -9,7 +9,7 @@ define <2 x i64> @vpload_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 [[PTR:%.*]], <2 x i1> [[TMP2]], <2 x i64> poison) ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 %evl) @@ -18,7 +18,7 @@ define <2 x i64> @vpload_v2i64(ptr %ptr, <2 x i1> %m, i32 zeroext %evl) { define <2 x i64> @vpload_v2i64_vlmax(ptr %ptr, <2 
x i1> %m) { ; CHECK-LABEL: @vpload_v2i64_vlmax( -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]], <2 x i64> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 [[PTR:%.*]], <2 x i1> [[M:%.*]], <2 x i64> poison) ; CHECK-NEXT: ret <2 x i64> [[TMP1]] ; %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> %m, i32 2) @@ -31,7 +31,7 @@ define <2 x i64> @vpload_v2i64_allones_mask(ptr %ptr, i32 zeroext %evl) { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], splat (i1 true) -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 1 [[PTR:%.*]], <2 x i1> [[TMP2]], <2 x i64> poison) ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %load = call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl) @@ -53,7 +53,7 @@ define void @vpstore_v2i64(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %e ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]] -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr align 1 [[PTR:%.*]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 %evl) @@ -62,7 +62,7 @@ define void @vpstore_v2i64(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %e define void @vpstore_v2i64_vlmax(<2 x i64> %val, ptr %ptr, <2 x i1> %m) { ; CHECK-LABEL: @vpstore_v2i64_vlmax( -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr align 1 [[PTR:%.*]], <2 x i1> [[M:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 2) @@ -75,7 +75,7 @@ define void @vpstore_v2i64_allones_mask(<2 x i64> %val, ptr %ptr, i32 zeroext %e ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], splat (i1 true) -; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[VAL:%.*]], ptr align 1 [[PTR:%.*]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.store.v2i64.p0(<2 x i64> %val, ptr %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl) @@ -98,7 +98,7 @@ define <vscale x 1 x i64> @vpload_nxv1i64(ptr %ptr, <vscale x 1 x i1> %m, i32 ze ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr 
[[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr align 1 [[PTR:%.*]], <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]] ; %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> %m, i32 %evl) @@ -109,7 +109,7 @@ define <vscale x 1 x i64> @vpload_nxv1i64_vscale(ptr %ptr, <vscale x 1 x i1> %m) ; CHECK-LABEL: @vpload_nxv1i64_vscale( ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr align 1 [[PTR:%.*]], <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP1]] ; %vscale = call i32 @llvm.vscale.i32() @@ -124,7 +124,7 @@ define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %ev ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], splat (i1 true) ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr align 1 [[PTR:%.*]], <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> poison) ; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]] ; %load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0(ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl) @@ -150,7 +150,7 @@ define void @vpstore_nxv1i64(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1 ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]] ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr align 1 [[PTR:%.*]], <vscale x 1 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 %evl) @@ -161,7 +161,7 @@ define void @vpstore_nxv1i64_vscale(<vscale x 1 x i64> %val, ptr %ptr, <vscale x ; CHECK-LABEL: @vpstore_nxv1i64_vscale( ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr align 1 [[PTR:%.*]], <vscale x 1 x i1> [[M:%.*]]) ; CHECK-NEXT: ret void ; %vscale = call i32 @llvm.vscale.i32() @@ -176,7 +176,7 @@ define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, ptr %ptr, i32 ; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], splat (i1 true) ; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1 -; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, 
<vscale x 1 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[VAL:%.*]], ptr align 1 [[PTR:%.*]], <vscale x 1 x i1> [[TMP2]]) ; CHECK-NEXT: ret void ; call void @llvm.vp.store.nxv1i64.p0(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> splat (i1 true), i32 %evl) diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/vector-nonlive-clobber.ll b/llvm/test/Transforms/RewriteStatepointsForGC/vector-nonlive-clobber.ll index caf4d52..c747a4d 100644 --- a/llvm/test/Transforms/RewriteStatepointsForGC/vector-nonlive-clobber.ll +++ b/llvm/test/Transforms/RewriteStatepointsForGC/vector-nonlive-clobber.ll @@ -16,7 +16,7 @@ define void @test_vector_clobber(ptr addrspace(1) %ptr) gc "statepoint-example" ; CHECK-NEXT: [[GEP_RELOCATED:%.*]] = call coldcc <8 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v8p1(token [[STATEPOINT_TOKEN1]], i32 2, i32 0) ; CHECK-NEXT: [[PTR_RELOCATED2:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN1]], i32 1, i32 1) ; CHECK-NEXT: [[DOTSPLAT_BASE_RELOCATED:%.*]] = call coldcc <8 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v8p1(token [[STATEPOINT_TOKEN1]], i32 2, i32 2) -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p1(<8 x ptr addrspace(1)> [[GEP_RELOCATED]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison) +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p1(<8 x ptr addrspace(1)> align 4 [[GEP_RELOCATED]], <8 x i1> splat (i1 true), <8 x float> poison) ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SCCP/constant-range-struct.ll b/llvm/test/Transforms/SCCP/constant-range-struct.ll index 7a399df..0f45b38 100644 --- a/llvm/test/Transforms/SCCP/constant-range-struct.ll +++ b/llvm/test/Transforms/SCCP/constant-range-struct.ll @@ -25,7 +25,7 @@ true: br label %exit false: - %s.3 = insertvalue {i64, i64} undef, i64 30, 0 + %s.3 = insertvalue {i64, i64} poison, i64 30, 0 %s.4 = insertvalue {i64, i64} %s.3, i64 300, 1 br label %exit @@ -39,14 +39,14 @@ define void @struct1_caller() { ; CHECK-NEXT: [[S:%.*]] = call { i64, i64 } @struct1() ; CHECK-NEXT: [[V1:%.*]] = extractvalue { i64, i64 } [[S]], 0 ; CHECK-NEXT: [[V2:%.*]] = extractvalue { i64, i64 } [[S]], 1 -; CHECK-NEXT: [[T_1:%.*]] = icmp ne i64 [[V1]], 10 -; CHECK-NEXT: call void @use(i1 [[T_1]]) -; CHECK-NEXT: [[T_2:%.*]] = icmp ult i64 [[V1]], 100 -; CHECK-NEXT: call void @use(i1 [[T_2]]) -; CHECK-NEXT: [[T_3:%.*]] = icmp ne i64 [[V2]], 0 +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_3:%.*]] = icmp eq i64 [[V1]], 20 ; CHECK-NEXT: call void @use(i1 [[T_3]]) -; CHECK-NEXT: [[T_4:%.*]] = icmp ult i64 [[V2]], 301 -; CHECK-NEXT: call void @use(i1 [[T_4]]) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_6:%.*]] = icmp eq i64 [[V2]], 300 +; CHECK-NEXT: call void @use(i1 [[T_6]]) ; CHECK-NEXT: ret void ; %s = call {i64, i64} @struct1() @@ -57,10 +57,14 @@ define void @struct1_caller() { call void @use(i1 %t.1) %t.2 = icmp ult i64 %v1, 100 call void @use(i1 %t.2) - %t.3 = icmp ne i64 %v2, 0 + %t.3 = icmp eq i64 %v1, 20 call void @use(i1 %t.3) - %t.4 = icmp ult i64 %v2, 301 + %t.4 = icmp ne i64 %v2, 0 call void @use(i1 %t.4) + %t.5 = icmp ult i64 %v2, 301 + call void @use(i1 %t.5) + %t.6 = icmp eq i64 %v2, 300 + call void @use(i1 %t.6) ret void } @@ -76,7 +80,7 @@ define internal {i64, i64} @struct2() { ; CHECK: exit: ; CHECK-NEXT: [[V1:%.*]] = phi i64 [ 20, 
[[TRUE]] ], [ 30, [[FALSE]] ] ; CHECK-NEXT: [[V2:%.*]] = phi i64 [ 200, [[TRUE]] ], [ 300, [[FALSE]] ] -; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } undef, i64 [[V1]], 0 +; CHECK-NEXT: [[S_1:%.*]] = insertvalue { i64, i64 } poison, i64 [[V1]], 0 ; CHECK-NEXT: [[S_2:%.*]] = insertvalue { i64, i64 } [[S_1]], i64 [[V2]], 1 ; CHECK-NEXT: ret { i64, i64 } [[S_2]] ; @@ -92,7 +96,7 @@ false: exit: %v1 = phi i64 [ 20, %true ], [ 30, %false ] %v2 = phi i64 [ 200, %true ], [ 300, %false ] - %s.1 = insertvalue {i64, i64} undef, i64 %v1, 0 + %s.1 = insertvalue {i64, i64} poison, i64 %v1, 0 %s.2 = insertvalue {i64, i64} %s.1, i64 %v2, 1 ret {i64, i64} %s.2 } @@ -153,3 +157,40 @@ define void @struct2_caller() { ret void } + +%"phi_type" = type {i64, i64} + +define internal %"phi_type" @test(i32 %input) { +; CHECK-LABEL: @test( +; CHECK-NEXT: br label [[COND_TRUE_I:%.*]] +; CHECK: cond.true.i: +; CHECK-NEXT: br label [[COND_END_I:%.*]] +; CHECK: cond.end.i: +; CHECK-NEXT: ret [[PHI_TYPE:%.*]] poison +; + %cmp.cond = icmp eq i32 %input, 1 + br i1 %cmp.cond, label %cond.true.i, label %cond.false.i + +cond.true.i: + %r1.tmp = insertvalue %"phi_type" poison, i64 1, 0 + %r1.tmp.2 = insertvalue %"phi_type" %r1.tmp, i64 2, 1 + br label %cond.end.i + +cond.false.i: + %r2.tmp = insertvalue %"phi_type" poison, i64 3, 0 + %r2.tmp.2 = insertvalue %"phi_type" %r2.tmp, i64 4, 1 + br label %cond.end.i + +cond.end.i: + %retval = phi %"phi_type" [ %r1.tmp.2, %cond.true.i ], [ %r2.tmp.2, %cond.false.i ] + ret %"phi_type" %retval +} + +define %"phi_type" @test2() { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[CALL_1:%.*]] = tail call fastcc [[PHI_TYPE:%.*]] @[[TEST:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](i32 noundef 1) +; CHECK-NEXT: ret [[PHI_TYPE]] { i64 1, i64 2 } +; + %call.1 = tail call fastcc noundef %"phi_type" @test(i32 noundef 1) + ret %"phi_type" %call.1 +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll new file mode 100644 index 0000000..d16843c --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div-like-mixed-with-undefs.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +define ptr @test(ptr %d) { +; CHECK-LABEL: define ptr @test( +; CHECK-SAME: ptr [[D:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[CMP4_2:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[CMP4_2]], i64 0, i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 0, 0 +; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = udiv i64 1, 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x i64> poison, i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x i64> [[TMP5]], i64 [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x i64> [[TMP6]], i64 [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x i64> [[TMP7]], <6 x i64> poison, <6 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 4> +; CHECK-NEXT: [[TMP9:%.*]] = mul <6 x i64> [[TMP8]], <i64 2, i64 6, i64 1, i64 1, i64 1, i64 0> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <6 x i64> [[TMP9]], i32 1 
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <6 x i64> [[TMP9]], i32 2 +; CHECK-NEXT: [[SCEVGEP42:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <6 x i64> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <6 x i64> [[TMP9]], i32 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <6 x i64> [[TMP9]], i32 5 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP19]] +; CHECK-NEXT: ret ptr [[TMP20]] +; +entry: + %0 = load i8, ptr null, align 1 + %cmp4.2 = icmp eq i8 %0, 0 + %1 = select i1 %cmp4.2, i64 0, i64 0 + %2 = shl i64 %1, 1 + %3 = getelementptr i8, ptr %d, i64 %2 + %4 = xor i64 0, 0 + %5 = udiv i64 %4, 0 + %6 = mul i64 %5, 6 + %7 = getelementptr i8, ptr %d, i64 %6 + %8 = shl i64 %1, 0 + %scevgep42 = getelementptr i8, ptr %d, i64 %8 + %9 = mul i64 %5, 1 + %10 = getelementptr i8, ptr %d, i64 %9 + %11 = udiv i64 1, 0 + %12 = mul i64 %11, 1 + %13 = getelementptr i8, ptr %d, i64 %12 + %14 = mul i64 %11, 0 + %15 = getelementptr i8, ptr %d, i64 %14 + ret ptr %15 +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll index 2bde202..9b34469 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalable-vector.ll @@ -8,10 +8,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[LOAD0:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) -; CHECK-NEXT: [[LOAD1:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: [[LOAD0:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) +; CHECK-NEXT: [[LOAD1:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 undef, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) ; CHECK-NEXT: [[ADD:%.*]] = add <vscale x 16 x i8> [[LOAD1]], [[LOAD0]] -; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[ADD]], ptr undef, i32 1, <vscale x 16 x i1> undef) +; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[ADD]], ptr align 1 undef, <vscale x 16 x i1> undef) ; CHECK-NEXT: ret void ; %load0 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, <vscale x 16 x i1> undef, <vscale x 16 x i8> undef) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 02e05b2..f8229b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -170,7 +170,7 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) { ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 1, <31 x i1> <i1 true, i1 false, i1 
true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr align 1 [[GEP_L0]], <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <31 x i8> [[TMP2]], <31 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> ; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void @@ -251,7 +251,7 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) { ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 1, <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr align 1 [[GEP_L0]], <31 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <31 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32> <i32 2, i32 0, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> ; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 @@ -532,7 +532,7 @@ define void @constant_stride_masked_no_reordering(ptr %pl, i64 %stride, ptr %ps) ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 1, <28 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <28 x i8> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr align 1 [[GEP_L0]], <28 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 
true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <28 x i8> poison) ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll index 82c9403..3a7112f 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll @@ -13,7 +13,7 @@ define i16 @test(ptr %i) { ; CHECK-NEXT: br label %[[FOR_COND5_US:.*]] ; CHECK: [[FOR_COND5_US]]: ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.experimental.vp.strided.load.v4i16.p0.i64(ptr align 2 [[GEP_US154_2]], i64 4914, <4 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP3]], i32 2, <4 x i1> splat (i1 true), <4 x i16> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 [[TMP3]], <4 x i1> splat (i1 true), <4 x i16> poison) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[TMP5]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll index 8e80aee..5697292 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-node-with-no-users.ll @@ -9,8 +9,8 @@ define void @test(ptr %c) { ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 222, i64 228, i64 276, i64 279, i64 282, i64 285, i64 288, i64 0> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 0, i64 345, i64 348, i64 351, i64 354, i64 357, i64 360, i64 363> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> splat (i1 true), <8 x i8> poison) -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP3]], i32 1, <8 x i1> splat (i1 true), <8 x i8> poison) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 [[TMP2]], <8 x i1> splat (i1 true), <8 x i8> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> align 1 [[TMP3]], <8 x i1> splat (i1 true), <8 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP7:%.*]] = 
shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; CHECK-NEXT: br label %[[FOR_COND:.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll index bf6e2bd..7fd88e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll @@ -17,7 +17,7 @@ define void @test() { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr null, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> splat (i1 true), <2 x float> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> align 4 zeroinitializer, <2 x i1> splat (i1 true), <2 x float> poison) ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x float> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP6]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll index da08718..e13dfce 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -7,7 +7,7 @@ define i32 @test() { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[IF_END_I87:%.*]] ; CHECK: if.end.i87: -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> <ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64064 to ptr), ptr inttoptr (i64 64064 to ptr)>, <4 x i64> <i64 0, i64 1, i64 0, i64 1>), i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 getelementptr (i32, <4 x ptr> <ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64036 to ptr), ptr inttoptr (i64 64064 to ptr), ptr inttoptr (i64 64064 to ptr)>, <4 x i64> <i64 0, i64 1, i64 0, i64 1>), <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> <i32 undef, i32 undef, i32 0, i32 0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> ; CHECK-NEXT: switch i32 0, label [[SW_BB509_I:%.*]] [ ; CHECK-NEXT: i32 1, label [[SW_BB509_I]] @@ -147,8 +147,8 @@ define ptr @test4() { ; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer ; POWEROF2-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <2 x i32> <i32 2, i32 3> ; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]] -; POWEROF2-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP16]] +; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP30]] ; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 ; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP18]], 0.000000e+00 ; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 diff --git 
a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll index ff89718..a0930a2 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -13,7 +13,7 @@ define void @test(ptr %a, i64 %0) { ; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]], <i64 1, i64 0> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] ; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison) +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 [[TMP6]], <2 x i1> splat (i1 true), <2 x double> poison) ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = fsub <2 x double> [[TMP8]], [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll index 8aa7529..16423e8 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll @@ -9,7 +9,7 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> splat (i1 true), <2 x i16> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> align 2 [[TMP2]], <2 x i1> splat (i1 true), <2 x i16> poison) ; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll index 65bf24c..8497db2 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll @@ -5,7 +5,7 @@ define i32 @sum_of_abs_stride_2(ptr noalias %a, ptr noalias %b) { ; CHECK-LABEL: define i32 @sum_of_abs_stride_2 ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i8> @llvm.masked.load.v15i8.p0(ptr [[A]], i32 1, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i8> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i8> @llvm.masked.load.v15i8.p0(ptr align 1 [[A]], <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 
x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <15 x i8> [[TMP0]], <15 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP1]], i1 false) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> @@ -58,7 +58,7 @@ define i32 @sum_of_abs_stride_3(ptr noalias %a, ptr noalias %b) { ; CHECK-LABEL: define i32 @sum_of_abs_stride_3 ; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <22 x i8> @llvm.masked.load.v22i8.p0(ptr [[A]], i32 1, <22 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i8> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <22 x i8> @llvm.masked.load.v22i8.p0(ptr align 1 [[A]], <22 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i8> poison) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <22 x i8> [[TMP0]], <22 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> ; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP1]], i1 false) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll index a079203..2b9891d 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll @@ -10,7 +10,7 @@ define void @test() { ; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1 ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: -; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> align 4 getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), <2 x i1> splat (i1 true), <2 x i32> poison) ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 ; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll index 07094c6..8fddb2e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll @@ -15,7 +15,7 @@ define i16 @test() { ; CHECK-NEXT: [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ] ; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1 ; CHECK-NEXT: [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1 -; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr [[PPREV_0_I]], i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison) +; CHECK-NEXT: 
[[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr align 2 [[PPREV_0_I]], <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison) ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <2 x i32> <i32 0, i32 2> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll index 510cf45..8be8e96 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll @@ -14,11 +14,11 @@ define void @test(ptr %mdct_forward_x) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> <i64 28, i64 36, i64 24, i64 28> -; CHECK-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 4 [[ADD_PTR_I]], <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison) ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0> -; CHECK-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison) +; CHECK-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr align 4 [[ARRAYIDX5_I_I]], <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison) ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 2, i32 0> -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true), <4 x float> poison) ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> <i32 2, i32 0, i32 2, i32 2> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 80ba7a4..dfd2c4a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -9,7 +9,7 @@ define void @test() { ; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr align 8 [[COND_IN_V]], <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 
false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison) ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret void @@ -20,7 +20,7 @@ define void @test() { ; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]] ; CHECK-SLP-THRESHOLD: bb: -; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison) +; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr align 8 [[COND_IN_V]], <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <13 x i64> poison) ; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> ; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/num-uses-for-copyable-elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/num-uses-for-copyable-elements.ll new file mode 100644 index 0000000..867464e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/num-uses-for-copyable-elements.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-apple-macosx15.0.0 -mcpu=skylake-avx512 -S < %s | FileCheck %s + +define void @test(ptr %output) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ARRAYIDX_2_I:%.*]] = getelementptr i8, ptr [[OUTPUT]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[OUTPUT]], <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> poison) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> <i32 -1, i32 0>, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1> +; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> splat (i32 2)) +; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[ARRAYIDX_2_I]], align 4 +; CHECK-NEXT: ret void +; +entry: + %arrayidx.2.i = getelementptr i8, ptr %output, i64 8 + %0 = load i32, ptr %output, align 4 + %arrayidx.3.i = getelementptr i8, ptr %output, i64 12 + %1 = load i32, ptr %arrayidx.3.i, align 4 + %xor7 = xor i32 -1, %0 + %or.i = tail call i32 @llvm.fshl.i32(i32 %xor7, i32 0, i32 2) + %or.i11 = tail call i32 @llvm.fshl.i32(i32 %1, i32 %1, i32 2) + store i32 %or.i, ptr %arrayidx.2.i, align 4 + store i32 %or.i11, ptr %arrayidx.3.i, align 4 + ret void +} + +declare i32 @llvm.fshl.i32(i32, i32, i32) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll index 57eb1e7..1fe1217d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/phi-operand-gathered-loads.ll @@ -19,7 +19,7 @@ define void @test(ptr %this, i1 %cmp4.not) { ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEWPT]], i64 92 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> align 4 [[TMP3]], <4 x i1> splat (i1 true), <4 x i32> poison) ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index fde76f8..8b58d0c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -188,7 +188,7 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-LABEL: define void @gather_load_2( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 4 [[TMP3]], <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9> ; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2> @@ -198,7 +198,7 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512VL-LABEL: define void @gather_load_2( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 4 [[TMP3]], <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9> ; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2> @@ -302,7 +302,7 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; 
AVX2-LABEL: define void @gather_load_3( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[TMP1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -311,7 +311,7 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: define void @gather_load_3( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[TMP1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -320,7 +320,7 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512VL-LABEL: define void @gather_load_3( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[TMP1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 
true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -442,7 +442,7 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; ; AVX2-LABEL: define void @gather_load_4( ; AVX2-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { -; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[T1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -451,7 +451,7 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; ; AVX512F-LABEL: define void @gather_load_4( ; AVX512F-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[T1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -460,7 +460,7 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; ; AVX512VL-LABEL: define void @gather_load_4( ; AVX512VL-SAME: ptr noalias captures(none) 
[[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[T1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -666,7 +666,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512F-LABEL: define void @gather_load_div( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr align 4 [[TMP1]], <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23> @@ -677,7 +677,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512VL-LABEL: define void @gather_load_div( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> 
@llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr align 4 [[TMP1]], <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index cf380f0..2d6f007 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -188,7 +188,7 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-LABEL: define void @gather_load_2( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 4 [[TMP3]], <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9> ; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2> @@ -198,7 +198,7 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512VL-LABEL: define void @gather_load_2( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 
4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr align 4 [[TMP3]], <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9> ; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2> @@ -302,7 +302,7 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX2-LABEL: define void @gather_load_3( ; AVX2-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[TMP1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -311,7 +311,7 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: define void @gather_load_3( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[TMP1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -320,7 
+320,7 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512VL-LABEL: define void @gather_load_3( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[TMP1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -442,7 +442,7 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; ; AVX2-LABEL: define void @gather_load_4( ; AVX2-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { -; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[T1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -451,7 +451,7 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; ; AVX512F-LABEL: define void @gather_load_4( ; AVX512F-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[T1]], <22 x i1> <i1 true, i1 false, i1 false, 
i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -460,7 +460,7 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; ; AVX512VL-LABEL: define void @gather_load_4( ; AVX512VL-SAME: ptr noalias captures(none) [[T0:%.*]], ptr noalias readonly captures(none) [[T1:%.*]]) #[[ATTR0]] { -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr align 4 [[T1]], <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21> ; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4> ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7> @@ -666,7 +666,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512F-LABEL: define void @gather_load_div( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr align 4 [[TMP1]], <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 
x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23> @@ -677,7 +677,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512VL-LABEL: define void @gather_load_div( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0]] { -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr align 4 [[TMP1]], <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index f921278..885e28d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -14,7 +14,7 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8 ; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20 -; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr align 8 [[GEP1_0]], <15 x i1> <i1 
true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison) ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]] @@ -29,7 +29,7 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr ; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0 ; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> align 8 [[P]], <2 x i1> splat (i1 true)) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index f0272d5..a907a1f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -11,7 +11,7 @@ define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %ar ; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]]) ; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16> -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> align 8 [[P]], <2 x i1> splat (i1 true)) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll index 3fd9e12..23a901f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll @@ -15,7 +15,7 @@ define void @test(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <35 x i32> @llvm.masked.load.v35i32.p0(ptr [[P:%.*]], i32 4, <35 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true>, <35 x i32> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <35 x i32> @llvm.masked.load.v35i32.p0(ptr align 4 [[P:%.*]], <35 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false, i1 false, i1 false, i1 true, i1 true, i1 true>, <35 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <35 x i32> [[TMP0]], <35 x i32> poison, <4 x i32> <i32 0, i32 32, i32 33, i32 34> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index bbb1b87..cadf038 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -6,12 +6,12 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 8 [[OFF0_1]], <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[TMP6]], <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]]) ; CHECK-NEXT: ret i32 [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 19ce11c..723537b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -10,7 +10,7 @@ define void @test() { ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 null, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 @@ -71,7 +71,7 @@ define void @test1() { ; 
CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 null, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 @@ -134,7 +134,7 @@ define void @test_div() { ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 null, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 @@ -195,7 +195,7 @@ define void @test_rem() { ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr align 4 null, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> <i32 0, i32 7> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index 7bb436b..71bda89 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -4,7 +4,7 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 16, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr align 
16 [[P:%.*]], <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 15, i32 4, i32 5, i32 0, i32 2, i32 6, i32 7, i32 8> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 15, i32 4, i32 5, i32 15, i32 4, i32 5, i32 15, i32 0, i32 5, i32 2, i32 6, i32 7, i32 8, i32 6, i32 7, i32 8> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 4, i32 24, i32 15, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 63dbf3c..e974121 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,7 +8,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true>, <6 x float> poison) +; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) align 4 [[TMP3]], <6 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true>, <6 x float> poison) ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> <i32 0, i32 1, i32 5, i32 4> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> <i32 0, i32 4, i32 0, i32 4, i32 5, i32 1, i32 5, i32 1> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll index 75a866f..42271fa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-load-compress.ll @@ -6,7 +6,7 @@ define void @test(ptr %in) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 32 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[IN]], i64 64 -; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr [[TMP1]], i32 2, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = call <32 x i16> @llvm.masked.load.v32i16.p0(ptr align 2 [[TMP1]], <32 x i1> <i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i16> zeroinitializer, [[TMP3]] ; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP0]], align 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll index 9d63c0f..94172cf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll @@ -6,7 +6,7 @@ define double @test01() { ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr null, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x ptr> zeroinitializer, <2 x i32> [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP2]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 [[TMP2]], <2 x i1> splat (i1 true), <2 x double> poison) ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double 0.000000e+00, double poison>, <2 x i32> <i32 2, i32 0> ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index d487e36..77084f5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,11 +5,11 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true>, <5 x i64> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 8 [[ARG:%.*]], <5 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true>, <5 x i64> poison) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true>, <5 x i64> poison) +; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr align 8 [[ARG]], <5 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true>, <5 x i64> poison) ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 4> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 8dc8db9..5a66f37 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -9,7 +9,7 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <12 x i32> poison) +; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[ARRAYIDX20]], <12 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <12 x i32> poison) ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> <i32 11, i32 3, i32 2, i32 0, i32 8, i32 9, i32 10, i32 1> ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index fdc0bc0..771df95 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -5,16 +5,16 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison) +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 8 [[ADDR]], <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) -; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[TMP5]], <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr align 8 [[GEP2]], <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 
false, i1 true, i1 false, i1 true>, <15 x i32> poison) ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> align 4 [[TMP8]], <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] ; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[S:%.*]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll index 51425c6..c0055af 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-load.ll @@ -29,7 +29,7 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) { ; CHECK-LE-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]] ; ; CHECK-LE-SVE-LABEL: @scalarize_v2i64( -; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]], <2 x i64> [[PASSTHRU:%.*]]) +; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 128 [[P:%.*]], <2 x i1> [[MASK:%.*]], <2 x i64> [[PASSTHRU:%.*]]) ; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]] ; ; CHECK-BE-LABEL: @scalarize_v2i64( @@ -66,7 +66,7 @@ define <2 x i64> @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %passthru) { ; CHECK-LE-NEXT: ret <2 x i64> [[TMP1]] ; ; CHECK-LE-SVE-LABEL: @scalarize_v2i64_ones_mask( -; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> splat (i1 true), <2 x i64> [[PASSTHRU:%.*]]) +; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 8 [[P:%.*]], <2 x i1> splat (i1 true), <2 x i64> [[PASSTHRU:%.*]]) ; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]] ; ; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask( @@ -82,7 +82,7 @@ define <2 x i64> @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %passthru) { ; CHECK-LE-NEXT: ret <2 x i64> [[PASSTHRU:%.*]] ; ; CHECK-LE-SVE-LABEL: @scalarize_v2i64_zero_mask( -; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer, <2 x i64> [[PASSTHRU:%.*]]) +; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 8 [[P:%.*]], <2 x i1> zeroinitializer, <2 x i64> [[PASSTHRU:%.*]]) ; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]] ; ; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask( @@ -100,7 +100,7 @@ define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) { ; CHECK-LE-NEXT: ret <2 x i64> [[TMP3]] ; ; CHECK-LE-SVE-LABEL: @scalarize_v2i64_const_mask( -; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> [[PASSTHRU:%.*]]) +; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 8 [[P:%.*]], <2 x i1> <i1 false, i1 true>, <2 x i64> [[PASSTHRU:%.*]]) ; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]] ; ; CHECK-BE-LABEL: @scalarize_v2i64_const_mask( diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll 
b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll index 0acc551..a7f2ac5 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/expand-masked-store.ll @@ -27,7 +27,7 @@ define void @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %data) { ; CHECK-LE-NEXT: ret void ; ; CHECK-SVE-LE-LABEL: @scalarize_v2i64( -; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]]) +; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr align 128 [[P:%.*]], <2 x i1> [[MASK:%.*]]) ; CHECK-SVE-LE-NEXT: ret void ; ; CHECK-BE-LABEL: @scalarize_v2i64( @@ -62,7 +62,7 @@ define void @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %data) { ; CHECK-LE-NEXT: ret void ; ; CHECK-SVE-LE-LABEL: @scalarize_v2i64_ones_mask( -; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> splat (i1 true)) +; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr align 8 [[P:%.*]], <2 x i1> splat (i1 true)) ; CHECK-SVE-LE-NEXT: ret void ; ; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask( @@ -78,7 +78,7 @@ define void @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %data) { ; CHECK-LE-NEXT: ret void ; ; CHECK-SVE-LE-LABEL: @scalarize_v2i64_zero_mask( -; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer) +; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr align 8 [[P:%.*]], <2 x i1> zeroinitializer) ; CHECK-SVE-LE-NEXT: ret void ; ; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask( @@ -96,7 +96,7 @@ define void @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %data) { ; CHECK-LE-NEXT: ret void ; ; CHECK-SVE-LE-LABEL: @scalarize_v2i64_const_mask( -; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>) +; CHECK-SVE-LE-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[DATA:%.*]], ptr align 8 [[P:%.*]], <2 x i1> <i1 false, i1 true>) ; CHECK-SVE-LE-NEXT: ret void ; ; CHECK-BE-LABEL: @scalarize_v2i64_const_mask( diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll index 1008066..4eda91ff 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll @@ -9,18 +9,18 @@ define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) { ; LOADSTORE-LABEL: @basic( ; LOADSTORE-NEXT: entry: ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr align 2 [[P:%.*]], <1 x i1> [[TMP0]], <1 x i16> poison) ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 -; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison) +; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[Q:%.*]], <1 x i1> [[TMP0]], <1 x i32> poison) ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 -; LOADSTORE-NEXT: [[TMP5:%.*]] = call <1 x i64> 
@llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison) +; LOADSTORE-NEXT: [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 8 [[B:%.*]], <1 x i1> [[TMP0]], <1 x i64> poison) ; LOADSTORE-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64 ; LOADSTORE-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr align 2 [[B]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr align 4 [[P]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr align 8 [[Q]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: ret void ; ; NONE-LABEL: @basic( @@ -63,10 +63,10 @@ define void @succ1to0(ptr %p, ptr %q, i32 %a) { ; LOADSTORE-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0 ; LOADSTORE-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison) +; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[Q:%.*]], <1 x i1> [[TMP1]], <1 x i32> poison) ; LOADSTORE-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr align 4 [[P:%.*]], <1 x i1> [[TMP1]]) ; LOADSTORE-NEXT: ret void ; ; NONE-LABEL: @succ1to0( @@ -100,7 +100,7 @@ define i32 @succ1to0_phi(ptr %p) { ; LOADSTORE-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null ; LOADSTORE-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) +; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[P]], <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) ; LOADSTORE-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 ; LOADSTORE-NEXT: ret i32 [[TMP3]] ; @@ -120,7 +120,7 @@ define i32 @succ1to0_phi(ptr %p) { ; LOADONLY-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null ; LOADONLY-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true ; LOADONLY-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> -; LOADONLY-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) +; LOADONLY-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[P]], <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) ; LOADONLY-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 ; LOADONLY-NEXT: ret i32 [[TMP3]] ; @@ -154,10 +154,10 @@ define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) { ; LOADSTORE-NEXT: entry: ; LOADSTORE-NEXT: [[COND:%.*]] = icmp eq 
i32 [[A:%.*]], 0 ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison) +; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[B:%.*]], <1 x i1> [[TMP0]], <1 x i32> poison) ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32 ; LOADSTORE-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr align 4 [[P:%.*]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: store i32 1, ptr [[Q:%.*]], align 4 ; LOADSTORE-NEXT: ret void ; @@ -196,8 +196,8 @@ define i64 @load_after_store(i32 %a, ptr %b, ptr %p) { ; LOADSTORE-NEXT: entry: ; LOADSTORE-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0 ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]]) -; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr align 4 [[B:%.*]], <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr align 2 [[P:%.*]], <1 x i1> [[TMP0]], <1 x i16> poison) ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 ; LOADSTORE-NEXT: [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64 ; LOADSTORE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i64 [[ZEXT]], i64 0 @@ -238,7 +238,7 @@ define void @load_skip_speculatable_memory_read(i32 %a, ptr %p, ptr %q) { ; LOADSTORE-NEXT: [[READ:%.*]] = call i32 @read_memory_only() ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr align 4 [[P:%.*]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: store i32 1, ptr [[Q:%.*]], align 4 ; LOADSTORE-NEXT: ret void ; @@ -248,7 +248,7 @@ define void @load_skip_speculatable_memory_read(i32 %a, ptr %p, ptr %q) { ; STOREONLY-NEXT: [[READ:%.*]] = call i32 @read_memory_only() ; STOREONLY-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1> ; STOREONLY-NEXT: [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32> -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr align 4 [[P:%.*]], <1 x i1> [[TMP0]]) ; STOREONLY-NEXT: store i32 1, ptr [[Q:%.*]], align 4 ; STOREONLY-NEXT: ret void ; @@ -301,7 +301,7 @@ define i32 @load_from_gep(ptr %p) { ; LOADSTORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; LOADSTORE-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) +; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[ARRAYIDX]], <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) ; LOADSTORE-NEXT: [[TMP3:%.*]] = 
bitcast <1 x i32> [[TMP2]] to i32 ; LOADSTORE-NEXT: ret i32 [[TMP3]] ; @@ -323,7 +323,7 @@ define i32 @load_from_gep(ptr %p) { ; LOADONLY-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; LOADONLY-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true ; LOADONLY-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> -; LOADONLY-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) +; LOADONLY-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[ARRAYIDX]], <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) ; LOADONLY-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 ; LOADONLY-NEXT: ret i32 [[TMP3]] ; @@ -358,14 +358,14 @@ define void @nondebug_metadata(i1 %cond, ptr %p, ptr %q) { ; LOADSTORE-LABEL: @nondebug_metadata( ; LOADSTORE-NEXT: entry: ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP1:%.*]] = call range(i16 0, 10) <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison) +; LOADSTORE-NEXT: [[TMP1:%.*]] = call range(i16 0, 10) <1 x i16> @llvm.masked.load.v1i16.p0(ptr align 2 [[P:%.*]], <1 x i1> [[TMP0]], <1 x i16> poison) ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16 -; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !annotation [[META5:![0-9]+]] +; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[Q:%.*]], <1 x i1> [[TMP0]], <1 x i32> poison), !annotation [[META5:![0-9]+]] ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 ; LOADSTORE-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <1 x i16> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP5]], ptr [[Q]], i32 4, <1 x i1> [[TMP0]]), !annotation [[META5]] +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP5]], ptr align 4 [[Q]], <1 x i1> [[TMP0]]), !annotation [[META5]] ; LOADSTORE-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[P]], i32 2, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr align 2 [[P]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: ret void ; ; NONE-LABEL: @nondebug_metadata( @@ -398,14 +398,14 @@ define i16 @debug_metadata_diassign(i1 %cond, i16 %a, ptr %p) { ; LOADSTORE-LABEL: @debug_metadata_diassign( ; LOADSTORE-NEXT: bb0: ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr align 4 [[P:%.*]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i16 3, i16 2 ; LOADSTORE-NEXT: ret i16 [[SPEC_SELECT]] ; ; STOREONLY-LABEL: @debug_metadata_diassign( ; STOREONLY-NEXT: bb0: ; STOREONLY-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; STOREONLY-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> splat (i16 7), ptr align 4 [[P:%.*]], <1 x i1> [[TMP0]]) ; STOREONLY-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i16 3, i16 2 ; STOREONLY-NEXT: ret i16 [[SPEC_SELECT]] ; @@ -451,7 +451,7 @@ define i32 @hoist_cond_stores(i1 
%cond, ptr %p) { ; LOADSTORE-NEXT: store i1 false, ptr [[P:%.*]], align 2 ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> ; LOADSTORE-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[COND]], i1 false, i1 false -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[P]], i32 8, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr align 8 [[P]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: store i1 [[SPEC_STORE_SELECT]], ptr [[P]], align 2 ; LOADSTORE-NEXT: ret i32 0 ; @@ -460,7 +460,7 @@ define i32 @hoist_cond_stores(i1 %cond, ptr %p) { ; STOREONLY-NEXT: store i1 false, ptr [[P:%.*]], align 2 ; STOREONLY-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> ; STOREONLY-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[COND]], i1 false, i1 false -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[P]], i32 8, <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr align 8 [[P]], <1 x i1> [[TMP0]]) ; STOREONLY-NEXT: store i1 [[SPEC_STORE_SELECT]], ptr [[P]], align 2 ; STOREONLY-NEXT: ret i32 0 ; @@ -507,10 +507,10 @@ define i32 @single_predecessor(ptr %p, ptr %q, i32 %a) { ; LOADSTORE-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]]) -; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr align 4 [[Q:%.*]], <1 x i1> [[TMP2]]) +; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[Q]], <1 x i1> [[TMP1]], <1 x i32> poison) ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr align 4 [[P:%.*]], <1 x i1> [[TMP1]]) ; LOADSTORE-NEXT: [[DOT:%.*]] = select i1 [[TOBOOL]], i32 2, i32 3 ; LOADSTORE-NEXT: ret i32 [[DOT]] ; @@ -548,23 +548,23 @@ define void @threshold_6(i1 %cond, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, ; LOADSTORE-LABEL: @threshold_6( ; LOADSTORE-NEXT: entry: ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[P1:%.*]], i32 4, <1 x i1> [[TMP0]]) -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 2), ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP0]]) -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr [[P3:%.*]], i32 4, <1 x i1> [[TMP0]]) -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr [[P4:%.*]], i32 4, <1 x i1> [[TMP0]]) -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr [[P5:%.*]], i32 4, <1 x i1> [[TMP0]]) -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr [[P6:%.*]], i32 4, <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr align 4 [[P1:%.*]], <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 
2), ptr align 4 [[P2:%.*]], <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr align 4 [[P3:%.*]], <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr align 4 [[P4:%.*]], <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr align 4 [[P5:%.*]], <1 x i1> [[TMP0]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr align 4 [[P6:%.*]], <1 x i1> [[TMP0]]) ; LOADSTORE-NEXT: ret void ; ; STOREONLY-LABEL: @threshold_6( ; STOREONLY-NEXT: entry: ; STOREONLY-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1> -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr [[P1:%.*]], i32 4, <1 x i1> [[TMP0]]) -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 2), ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP0]]) -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr [[P3:%.*]], i32 4, <1 x i1> [[TMP0]]) -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr [[P4:%.*]], i32 4, <1 x i1> [[TMP0]]) -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr [[P5:%.*]], i32 4, <1 x i1> [[TMP0]]) -; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr [[P6:%.*]], i32 4, <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 1), ptr align 4 [[P1:%.*]], <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 2), ptr align 4 [[P2:%.*]], <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 3), ptr align 4 [[P3:%.*]], <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 4), ptr align 4 [[P4:%.*]], <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 5), ptr align 4 [[P5:%.*]], <1 x i1> [[TMP0]]) +; STOREONLY-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> splat (i32 6), ptr align 4 [[P6:%.*]], <1 x i1> [[TMP0]]) ; STOREONLY-NEXT: ret void ; ; LOADONLY-LABEL: @threshold_6( @@ -987,7 +987,7 @@ define i32 @succ_phi_has_3input(i1 %cond1, ptr %p, i1 %cond2) { ; LOADSTORE-NEXT: br i1 [[COND1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]] ; LOADSTORE: bb1: ; LOADSTORE-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND2:%.*]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[P:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> zeroinitializer) +; LOADSTORE-NEXT: [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 8 [[P:%.*]], <1 x i1> [[TMP0]], <1 x i64> zeroinitializer) ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 ; LOADSTORE-NEXT: br label [[BB3]] ; LOADSTORE: bb3: @@ -1013,7 +1013,7 @@ define i32 @succ_phi_has_3input(i1 %cond1, ptr %p, i1 %cond2) { ; LOADONLY-NEXT: br i1 [[COND1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]] ; LOADONLY: bb1: ; LOADONLY-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND2:%.*]] to <1 x i1> -; LOADONLY-NEXT: [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[P:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> zeroinitializer) +; LOADONLY-NEXT: [[TMP1:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr align 8 [[P:%.*]], <1 x i1> [[TMP0]], <1 x i64> zeroinitializer) ; LOADONLY-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64 ; LOADONLY-NEXT: br label [[BB3]] ; 
LOADONLY: bb3: @@ -1056,10 +1056,10 @@ define i32 @succ1to0_phi2(ptr %p, ptr %p2) { ; LOADSTORE-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null ; LOADSTORE-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) +; LOADSTORE-NEXT: [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[P]], <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) ; LOADSTORE-NEXT: [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32 ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr align 4 [[P2:%.*]], <1 x i1> [[TMP1]]) ; LOADSTORE-NEXT: ret i32 [[TMP3]] ; ; NONE-LABEL: @succ1to0_phi2( @@ -1095,10 +1095,10 @@ define i32 @succ1to0_phi3(ptr %p, ptr %p2, i32 %x) { ; LOADSTORE-NEXT: [[TMP0:%.*]] = xor i1 [[COND]], true ; LOADSTORE-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1> ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast i32 [[X:%.*]] to <1 x i32> -; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) +; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[P]], <1 x i1> [[TMP1]], <1 x i32> zeroinitializer) ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 ; LOADSTORE-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P2:%.*]], i32 4, <1 x i1> [[TMP1]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr align 4 [[P2:%.*]], <1 x i1> [[TMP1]]) ; LOADSTORE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 [[X]], i32 [[TMP4]] ; LOADSTORE-NEXT: [[RES:%.*]] = add i32 [[TMP4]], [[SPEC_SELECT]] ; LOADSTORE-NEXT: ret i32 [[RES]] @@ -1177,10 +1177,10 @@ define void @hoist_store_without_cstore(ptr %0, ptr %1, i1 %cmp) { ; LOADSTORE-NEXT: entry: ; LOADSTORE-NEXT: store i32 0, ptr [[TMP1:%.*]], align 8 ; LOADSTORE-NEXT: [[TMP2:%.*]] = bitcast i1 [[CMP:%.*]] to <1 x i1> -; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP0:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison) +; LOADSTORE-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[TMP0:%.*]], <1 x i1> [[TMP2]], <1 x i32> poison) ; LOADSTORE-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 ; LOADSTORE-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32> -; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[TMP1]], i32 8, <1 x i1> [[TMP2]]) +; LOADSTORE-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr align 8 [[TMP1]], <1 x i1> [[TMP2]]) ; LOADSTORE-NEXT: ret void ; ; STOREONLY-LABEL: @hoist_store_without_cstore( @@ -1198,7 +1198,7 @@ define void @hoist_store_without_cstore(ptr %0, ptr %1, i1 %cmp) { ; LOADONLY-NEXT: entry: ; LOADONLY-NEXT: store i32 0, ptr [[TMP1:%.*]], align 8 ; LOADONLY-NEXT: [[TMP2:%.*]] = bitcast i1 [[CMP:%.*]] to <1 x i1> -; LOADONLY-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP0:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison) +; LOADONLY-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr align 4 [[TMP0:%.*]], <1 x i1> [[TMP2]], <1 x 
i32> poison) ; LOADONLY-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32 ; LOADONLY-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP4]], i32 0 ; LOADONLY-NEXT: store i32 [[SPEC_STORE_SELECT]], ptr [[TMP1]], align 8 diff --git a/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll b/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll index c966bed..c7500cf 100644 --- a/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll +++ b/llvm/test/Transforms/SpeculativeExecution/spec-casts.ll @@ -1,147 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=speculative-execution \ ; RUN: -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \ ; RUN: | FileCheck %s -; CHECK-LABEL: @ifThen_bitcast( -; CHECK: bitcast -; CHECK: br i1 true -define void @ifThen_bitcast() { +define void @ifThen_bitcast(i32 %arg) { +; CHECK-LABEL: define void @ifThen_bitcast( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = bitcast i32 [[ARG]] to float +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = bitcast i32 undef to float + %x = bitcast i32 %arg to float br label %b b: ret void } -; CHECK-LABEL: @ifThen_ptrtoint( -; CHECK: ptrtoint -; CHECK: br i1 true -define void @ifThen_ptrtoint() { +define void @ifThen_ptrtoint(ptr %arg) { +; CHECK-LABEL: define void @ifThen_ptrtoint( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = ptrtoint ptr undef to i64 + %x = ptrtoint ptr %arg to i64 br label %b b: ret void } -; CHECK-LABEL: @ifThen_inttoptr( -; CHECK: inttoptr -; CHECK: br i1 true -define void @ifThen_inttoptr() { +define void @ifThen_ptrtoaddr(ptr %arg) { +; CHECK-LABEL: define void @ifThen_ptrtoaddr( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = ptrtoaddr ptr [[ARG]] to i64 +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = inttoptr i64 undef to ptr + %x = ptrtoaddr ptr %arg to i64 br label %b b: ret void } -; CHECK-LABEL: @ifThen_addrspacecast( -; CHECK: addrspacecast -; CHECK: br i1 true -define void @ifThen_addrspacecast() { +define void @ifThen_inttoptr(i64 %arg) { +; CHECK-LABEL: define void @ifThen_inttoptr( +; CHECK-SAME: i64 [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = inttoptr i64 [[ARG]] to ptr +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; + br i1 true, label %a, label %b + +a: + %x = inttoptr i64 %arg to ptr + br label %b + +b: + ret void +} + +define void @ifThen_addrspacecast(ptr %arg) { +; CHECK-LABEL: define void @ifThen_addrspacecast( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(1) +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = addrspacecast ptr undef to ptr addrspace(1) + %x = addrspacecast ptr %arg to ptr addrspace(1) br label %b b: ret void } -; 
CHECK-LABEL: @ifThen_fptoui( -; CHECK: fptoui -; CHECK: br i1 true -define void @ifThen_fptoui() { +define void @ifThen_fptoui(float %arg) { +; CHECK-LABEL: define void @ifThen_fptoui( +; CHECK-SAME: float [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = fptoui float [[ARG]] to i32 +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = fptoui float undef to i32 + %x = fptoui float %arg to i32 br label %b b: ret void } -; CHECK-LABEL: @ifThen_fptosi( -; CHECK: fptosi -; CHECK: br i1 true -define void @ifThen_fptosi() { +define void @ifThen_fptosi(float %arg) { +; CHECK-LABEL: define void @ifThen_fptosi( +; CHECK-SAME: float [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = fptosi float [[ARG]] to i32 +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = fptosi float undef to i32 + %x = fptosi float %arg to i32 br label %b b: ret void } -; CHECK-LABEL: @ifThen_uitofp( -; CHECK: uitofp -; CHECK: br i1 true -define void @ifThen_uitofp() { +define void @ifThen_uitofp(i32 %arg) { +; CHECK-LABEL: define void @ifThen_uitofp( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = uitofp i32 [[ARG]] to float +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = uitofp i32 undef to float + %x = uitofp i32 %arg to float br label %b b: ret void } -; CHECK-LABEL: @ifThen_sitofp( -; CHECK: sitofp -; CHECK: br i1 true -define void @ifThen_sitofp() { +define void @ifThen_sitofp(i32 %arg) { +; CHECK-LABEL: define void @ifThen_sitofp( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = sitofp i32 [[ARG]] to float +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = sitofp i32 undef to float + %x = sitofp i32 %arg to float br label %b b: ret void } -; CHECK-LABEL: @ifThen_fpext( -; CHECK: fpext -; CHECK: br i1 true -define void @ifThen_fpext() { +define void @ifThen_fpext(float %arg) { +; CHECK-LABEL: define void @ifThen_fpext( +; CHECK-SAME: float [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = fpext float [[ARG]] to double +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = fpext float undef to double + %x = fpext float %arg to double br label %b b: ret void } -; CHECK-LABEL: @ifThen_fptrunc( -; CHECK: fptrunc -; CHECK: br i1 true -define void @ifThen_fptrunc() { +define void @ifThen_fptrunc(double %arg) { +; CHECK-LABEL: define void @ifThen_fptrunc( +; CHECK-SAME: double [[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = fptrunc double [[ARG]] to float +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = fptrunc double undef to float + %x = fptrunc double %arg to float br label %b b: ret void } -; CHECK-LABEL: @ifThen_trunc( -; CHECK: trunc -; CHECK: br i1 true -define void @ifThen_trunc() { +define void @ifThen_trunc(i32 %arg) { +; CHECK-LABEL: define void @ifThen_trunc( +; CHECK-SAME: i32 
[[ARG:%.*]]) { +; CHECK-NEXT: [[X:%.*]] = trunc i32 [[ARG]] to i16 +; CHECK-NEXT: br i1 true, label %[[A:.*]], label %[[B:.*]] +; CHECK: [[A]]: +; CHECK-NEXT: br label %[[B]] +; CHECK: [[B]]: +; CHECK-NEXT: ret void +; br i1 true, label %a, label %b a: - %x = trunc i32 undef to i16 + %x = trunc i32 %arg to i16 br label %b b: diff --git a/llvm/test/Verifier/intrinsic-bad-arg-type.ll b/llvm/test/Verifier/intrinsic-bad-arg-type.ll index e0684a8..e986617 100644 --- a/llvm/test/Verifier/intrinsic-bad-arg-type.ll +++ b/llvm/test/Verifier/intrinsic-bad-arg-type.ll @@ -4,7 +4,7 @@ ; CHECK-NEXT: ptr @llvm.masked.load.nxv4i32.p0 define <vscale x 4 x i32> @masked_load(ptr %addr, <4 x i1> %mask, <vscale x 4 x i32> %dst) { - %res = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %addr, i32 4, <4 x i1> %mask, <vscale x 4 x i32> %dst) + %res = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %addr, <4 x i1> %mask, <vscale x 4 x i32> %dst) ret <vscale x 4 x i32> %res } -declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr, i32, <4 x i1>, <vscale x 4 x i32>) +declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr, <4 x i1>, <vscale x 4 x i32>) diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll index d5aef3d..6e68dde 100644 --- a/llvm/test/Verifier/intrinsic-immarg.ll +++ b/llvm/test/Verifier/intrinsic-immarg.ll @@ -127,42 +127,6 @@ define i64 @umul_fix_sat(i64 %arg0, i64 %arg1, i32 %arg2) { ret i64 %ret } -declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) -define <2 x double> @masked_load(<2 x i1> %mask, ptr %addr, <2 x double> %dst, i32 %align) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %align - ; CHECK-NEXT: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 %align, <2 x i1> %mask, <2 x double> %dst) - %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 %align, <2 x i1> %mask, <2 x double> %dst) - ret <2 x double> %res -} - -declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) -define void @masked_store(<4 x i1> %mask, ptr %addr, <4 x i32> %val, i32 %align) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %align - ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 %align, <4 x i1> %mask) - call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 %align, <4 x i1> %mask) - ret void -} - -declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) -define <2 x double> @test_gather(<2 x ptr> %ptrs, <2 x i1> %mask, <2 x double> %src0, i32 %align) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK: i32 %align - ; CHECK: %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 %align, <2 x i1> %mask, <2 x double> %src0) - %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %ptrs, i32 %align, <2 x i1> %mask, <2 x double> %src0) - ret <2 x double> %res -} - -declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>) -define void @test_scatter_8i32(<8 x i32> %a1, <8 x ptr> %ptr, <8 x i1> %mask, i32 %align) { - ; CHECK: immarg operand has non-immediate parameter - ; CHECK-NEXT: i32 %align - ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 %align, <8 x i1> %mask) - call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 %align, <8 x i1> %mask) - ret void -} - declare ptr @llvm.invariant.start.p0(i64, 
ptr) define void @test_invariant_start(i64 %arg0, ptr %ptr) { ; CHECK: immarg operand has non-immediate parameter diff --git a/llvm/test/Verifier/masked-load.ll b/llvm/test/Verifier/masked-load.ll deleted file mode 100644 index 1522958..0000000 --- a/llvm/test/Verifier/masked-load.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s - -declare <2 x double> @llvm.masked.load.v2f64.p0(ptr, i32, <2 x i1>, <2 x double>) - -define <2 x double> @masked_load(<2 x i1> %mask, ptr %addr, <2 x double> %dst) { - ; CHECK: masked_load: alignment must be a power of 2 - ; CHECK-NEXT: %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 3, <2 x i1> %mask, <2 x double> %dst) - %res = call <2 x double> @llvm.masked.load.v2f64.p0(ptr %addr, i32 3, <2 x i1>%mask, <2 x double> %dst) - ret <2 x double> %res -} diff --git a/llvm/test/Verifier/masked-store.ll b/llvm/test/Verifier/masked-store.ll deleted file mode 100644 index 324adbd..0000000 --- a/llvm/test/Verifier/masked-store.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s - -declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>) - -define void @masked_store(<4 x i1> %mask, ptr %addr, <4 x i32> %val) { - ; CHECK: masked_store: alignment must be a power of 2 - ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 3, <4 x i1> %mask) - call void @llvm.masked.store.v4i32.p0(<4 x i32> %val, ptr %addr, i32 3, <4 x i1> %mask) - ret void -} diff --git a/llvm/test/Verifier/opaque-ptr.ll b/llvm/test/Verifier/opaque-ptr.ll index 3ac9044..e58fb56 100644 --- a/llvm/test/Verifier/opaque-ptr.ll +++ b/llvm/test/Verifier/opaque-ptr.ll @@ -52,9 +52,9 @@ define void @opaque_mangle() { define void @intrinsic_calls(ptr %a) { ; CHECK-LABEL: @intrinsic_calls( -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[A:%.*]], i32 4, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer) -; CHECK-NEXT: call void @llvm.masked.store.v2i32.p0(<2 x i32> zeroinitializer, ptr [[A]], i32 4, <2 x i1> zeroinitializer) -; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> zeroinitializer, i32 4, <2 x i1> zeroinitializer, <2 x i64> zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr align 4 [[A:%.*]], <2 x i1> zeroinitializer, <2 x i32> zeroinitializer) +; CHECK-NEXT: call void @llvm.masked.store.v2i32.p0(<2 x i32> zeroinitializer, ptr align 4 [[A]], <2 x i1> zeroinitializer) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> align 4 zeroinitializer, <2 x i1> zeroinitializer, <2 x i64> zeroinitializer) ; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.preserve.array.access.index.p0.p0(ptr elementtype(i32) null, i32 0, i32 0) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Verifier/scatter_gather.ll b/llvm/test/Verifier/scatter_gather.ll index 53be502..1d6c36b 100644 --- a/llvm/test/Verifier/scatter_gather.ll +++ b/llvm/test/Verifier/scatter_gather.ll @@ -3,112 +3,112 @@ ; Mask is not a vector ; CHECK: Intrinsic has incorrect argument type! 
define <16 x float> @gather2(<16 x ptr> %ptrs, ptr %mask, <16 x float> %passthru) { - %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, ptr %mask, <16 x float> %passthru) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, ptr %mask, <16 x float> %passthru) ret <16 x float> %res } -declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, ptr, <16 x float>) +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, ptr, <16 x float>) ; Mask length != return length ; CHECK: Intrinsic has incorrect argument type! define <8 x float> @gather3(<8 x ptr> %ptrs, <16 x i1> %mask, <8 x float> %passthru) { - %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, i32 4, <16 x i1> %mask, <8 x float> %passthru) + %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %ptrs, <16 x i1> %mask, <8 x float> %passthru) ret <8 x float> %res } -declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <16 x i1>, <8 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, <16 x i1>, <8 x float>) ; Return type is not a vector ; CHECK: Intrinsic has incorrect return type! define ptr @gather4(<8 x ptr> %ptrs, <8 x i1> %mask, <8 x float> %passthru) { - %res = call ptr @llvm.masked.gather.p0.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru) + %res = call ptr @llvm.masked.gather.p0.v8p0(<8 x ptr> %ptrs, <8 x i1> %mask, <8 x float> %passthru) ret ptr %res } -declare ptr @llvm.masked.gather.p0.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) +declare ptr @llvm.masked.gather.p0.v8p0(<8 x ptr>, <8 x i1>, <8 x float>) ; Value type is not a vector ; CHECK: Intrinsic has incorrect argument type! define <8 x float> @gather5(ptr %ptrs, <8 x i1> %mask, <8 x float> %passthru) { - %res = call <8 x float> @llvm.masked.gather.v8f32.p0(ptr %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru) + %res = call <8 x float> @llvm.masked.gather.v8f32.p0(ptr %ptrs, <8 x i1> %mask, <8 x float> %passthru) ret <8 x float> %res } -declare <8 x float> @llvm.masked.gather.v8f32.p0(ptr, i32, <8 x i1>, <8 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.p0(ptr, <8 x i1>, <8 x float>) ; Value type is not a vector of pointers ; CHECK: Intrinsic has incorrect argument type! define <8 x float> @gather6(<8 x float> %ptrs, <8 x i1> %mask, <8 x float> %passthru) { - %res = call <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru) + %res = call <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float> %ptrs, <8 x i1> %mask, <8 x float> %passthru) ret <8 x float> %res } -declare <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float>, i32, <8 x i1>, <8 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float>, <8 x i1>, <8 x float>) ; Value length!= vector of pointers length ; CHECK: Intrinsic has incorrect argument type! 
; CHECK-NEXT: ptr @llvm.masked.gather.v8f32.v16p0 define <8 x float> @gather8(<16 x ptr> %ptrs, <8 x i1> %mask, <8 x float> %passthru) { - %res = call <8 x float> @llvm.masked.gather.v8f32.v16p0(<16 x ptr> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru) + %res = call <8 x float> @llvm.masked.gather.v8f32.v16p0(<16 x ptr> %ptrs, <8 x i1> %mask, <8 x float> %passthru) ret <8 x float> %res } -declare <8 x float> @llvm.masked.gather.v8f32.v16p0(<16 x ptr>, i32, <8 x i1>, <8 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v16p0(<16 x ptr>, <8 x i1>, <8 x float>) ; Passthru type doesn't match return type ; CHECK: Intrinsic has incorrect argument type! ; CHECK-NEXT: ptr @llvm.masked.gather.v16i32.v16p0 define <16 x i32> @gather9(<16 x ptr> %ptrs, <16 x i1> %mask, <8 x i32> %passthru) { - %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <8 x i32> %passthru) + %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptrs, <16 x i1> %mask, <8 x i32> %passthru) ret <16 x i32> %res } -declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <8 x i32>) +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, <16 x i1>, <8 x i32>) ; Mask is not a vector ; CHECK: Intrinsic has incorrect argument type! ; CHECK-NEXT: ptr @llvm.masked.scatter.v16f32.v16p0 define void @scatter2(<16 x float> %value, <16 x ptr> %ptrs, ptr %mask) { - call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %value, <16 x ptr> %ptrs, i32 4, ptr %mask) + call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %value, <16 x ptr> %ptrs, ptr %mask) ret void } -declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, ptr) +declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, ptr) ; Mask length != value length ; CHECK: Intrinsic has incorrect argument type! ; CHECK-NEXT: ptr @llvm.masked.scatter.v8f32.v8p0 define void @scatter3(<8 x float> %value, <8 x ptr> %ptrs, <16 x i1> %mask) { - call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %value, <8 x ptr> %ptrs, i32 4, <16 x i1> %mask) + call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %value, <8 x ptr> %ptrs, <16 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, <16 x i1>) ; Value type is not a vector ; CHECK: Intrinsic has incorrect argument type! ; CHECK-NEXT: ptr @llvm.masked.scatter.p0.v8p0 define void @scatter4(ptr %value, <8 x ptr> %ptrs, <8 x i1> %mask) { - call void @llvm.masked.scatter.p0.v8p0(ptr %value, <8 x ptr> %ptrs, i32 4, <8 x i1> %mask) + call void @llvm.masked.scatter.p0.v8p0(ptr %value, <8 x ptr> %ptrs, <8 x i1> %mask) ret void } -declare void @llvm.masked.scatter.p0.v8p0(ptr, <8 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.p0.v8p0(ptr, <8 x ptr>, <8 x i1>) ; ptrs is not a vector ; CHECK: Intrinsic has incorrect argument type! ; CHECK-NEXT: ptr @llvm.masked.scatter.v8f32.p0 define void @scatter5(<8 x float> %value, ptr %ptrs, <8 x i1> %mask) { - call void @llvm.masked.scatter.v8f32.p0(<8 x float> %value, ptr %ptrs, i32 4, <8 x i1> %mask) + call void @llvm.masked.scatter.v8f32.p0(<8 x float> %value, ptr %ptrs, <8 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v8f32.p0(<8 x float>, ptr, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f32.p0(<8 x float>, ptr, <8 x i1>) ; Value type is not a vector of pointers ; CHECK: Intrinsic has incorrect argument type! 
; CHECK-NEXT: ptr @llvm.masked.scatter.v8f32.v8f32 define void @scatter6(<8 x float> %value, <8 x float> %ptrs, <8 x i1> %mask) { - call void @llvm.masked.scatter.v8f32.v8f32(<8 x float> %value, <8 x float> %ptrs, i32 4, <8 x i1> %mask) + call void @llvm.masked.scatter.v8f32.v8f32(<8 x float> %value, <8 x float> %ptrs, <8 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v8f32.v8f32(<8 x float>, <8 x float>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f32.v8f32(<8 x float>, <8 x float>, <8 x i1>) ; Value length!= vector of pointers length ; CHECK: Intrinsic has incorrect argument type! ; CHECK-NEXT: ptr @llvm.masked.scatter.v8f32.v16p0 define void @scatter8(<8 x float> %value, <16 x ptr> %ptrs, <8 x i1> %mask) { - call void @llvm.masked.scatter.v8f32.v16p0(<8 x float> %value, <16 x ptr> %ptrs, i32 4, <8 x i1> %mask) + call void @llvm.masked.scatter.v8f32.v16p0(<8 x float> %value, <16 x ptr> %ptrs, <8 x i1> %mask) ret void } -declare void @llvm.masked.scatter.v8f32.v16p0(<8 x float>, <16 x ptr>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f32.v16p0(<8 x float>, <16 x ptr>, <8 x i1>) diff --git a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml index 42ef7f3..5f19a29 100644 --- a/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml +++ b/llvm/test/tools/llvm-gsymutil/ARM_AArch64/macho-gsym-merged-callsites-dsym.yaml @@ -4,6 +4,7 @@ # RUN: split-file %s %t # RUN: yaml2obj %t/merged_callsites.dSYM.yaml -o %t/merged_callsites.dSYM +# The object file is manually tampered with such that the LLVM_stmt_seq of function_bad_stmt_seq is 0xFFFFFFFF. # RUN: llvm-gsymutil --num-threads=1 --convert=%t/merged_callsites.dSYM --merged-functions --callsites-yaml-file=%t/callsites.yaml -o %t/call_sites_dSYM.gsym # RUN: llvm-gsymutil --num-threads=1 --convert=%t/merged_callsites.dSYM --merged-functions --dwarf-callsites -o %t/dwarf_call_sites_dSYM.gsym @@ -36,7 +37,13 @@ # CHECK-MERGED-CALLSITES: CallSites (by relative return offset): # CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy1] -# CHECK-MERGED-CALLSITES: FunctionInfo @ 0x[[#%x,MAIN:]]: [0x[[#%x,MAIN_START:]] - 0x[[#%x,MAIN_END:]]) "main" +# If we don't do anything, a function with bad LLVM_stmt_seq won't have any call site filter +# More importantly, the line number will be at the function definition. +# CHECK-MERGED-CALLSITES: FunctionInfo @ 0x[[#%x,BAD_STMT_SEQ:]]: [0x[[#%x,BAD_STMT_SEQ_START:]] - 0x[[#%x,BAD_STMT_SEQ_END:]]) "function_bad_stmt_seq" +# CHECK-MERGED-CALLSITES-NEXT: LineTable: +# CHECK-MERGED-CALLSITES-NEXT: 0x00000001000003b0 ./merged_funcs_test.cpp:65 + +# CHECK-MERGED-CALLSITES-NEXT: FunctionInfo @ 0x[[#%x,MAIN:]]: [0x[[#%x,MAIN_START:]] - 0x[[#%x,MAIN_END:]]) "main" # CHECK-MERGED-CALLSITES: CallSites (by relative return offset): # CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function1] # CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy2] @@ -44,16 +51,17 @@ # CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function3_copy2] # CHECK-MERGED-CALLSITES-NEXT: 0x[[#%.4x,]] Flags[None] MatchRegex[function2_copy1] - ### Check that we can correctly resove merged functions using callstacks: ### Resolve two callstacks containing merged functions. ### We use the value obtained from `CallSites:[FILTER]` to pass to the next call to `llvm-gsymutil` via `--merged-functions-filter`. 
### The callstacks resolve differently based on the merged functions filter. -### 0x00000001000003d0 => 0x000000010000037c => 0x000000010000035c => 0x0000000100000340 -### 0x00000001000003e8 =========================> 0x000000010000035c => 0x0000000100000340 +### 0x00000001000003d8 => 0x000000010000037c => 0x000000010000035c => 0x0000000100000340 +### 0x00000001000003f0 =========================> 0x000000010000035c => 0x0000000100000340 +### +### We added a new function, for main function, +8 for line numbers, +0x8 for addresses. -# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x00000001000003d0 | FileCheck --check-prefix=CHECK-C1 %s -# CHECK-C1: 0x00000001000003d0: main + 32 @ ./merged_funcs_test.cpp:63 +# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x00000001000003d8 | FileCheck --check-prefix=CHECK-C1 %s +# CHECK-C1: 0x00000001000003d8: main + 32 @ ./merged_funcs_test.cpp:71 # CHECK-C1-NEXT: CallSites: function2_copy2 # RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x000000010000037c --merged-functions-filter="function2_copy2" | FileCheck --check-prefix=CHECK-C2 %s @@ -73,9 +81,9 @@ ### ---------------------------------------------------------------------------------------------------------------------------------- ### Resolve the 2nd call stack - the 2nd and 3rd addresses are the same but they resolve to a different function because of the filter -# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003e8 --merged-functions | FileCheck --check-prefix=CHECK-C5 %s -# CHECK-C5: Found 1 function at address 0x00000001000003e8: -# CHECK-C5-NEXT: 0x00000001000003e8: main + 56 @ ./merged_funcs_test.cpp:64 +# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003f0 --merged-functions | FileCheck --check-prefix=CHECK-C5 %s +# CHECK-C5: Found 1 function at address 0x00000001000003f0: +# CHECK-C5-NEXT: 0x00000001000003f0: main + 56 @ ./merged_funcs_test.cpp:72 # CHECK-C5-NEXT: CallSites: function3_copy2 # RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --merged-functions --address=0x000000010000035c --merged-functions-filter="function3_copy2" | FileCheck --check-prefix=CHECK-C6 %s @@ -87,6 +95,9 @@ # CHECK-C7: Found 1 function at address 0x0000000100000340: # CHECK-C7-NEXT: 0x0000000100000340: function4_copy2 + 8 @ ./merged_funcs_test.cpp:14 +# RUN: llvm-gsymutil %t/dwarf_call_sites_dSYM.gsym --address=0x00000001000003b4 --merged-functions | FileCheck --check-prefix=CHECK-BAD %s +# CHECK-BAD: Found 1 function at address 0x00000001000003b4: +# CHECK-BAD-NEXT: 0x00000001000003b4: function_bad_stmt_seq + 4 @ ./merged_funcs_test.cpp:65 #--- merged_funcs_test.cpp #define ATTRIB extern "C" __attribute__((noinline)) @@ -148,6 +159,14 @@ ATTRIB int function1(int a) { return result; } +// Intentional multi-line function definition +ATTRIB +int function_bad_stmt_seq( + int a +) { + return a + 1; +} + int main() { int sum = 0; sum += function1(1); @@ -155,6 +174,7 @@ int main() { sum += function4_copy2(4); sum += function3_copy2(41); sum += function2_copy1(11); + sum += function_bad_stmt_seq(sum); return sum; } @@ -229,7 +249,7 @@ FileHeader: LoadCommands: - cmd: LC_UUID cmdsize: 24 - uuid: 4C4C441A-5555-3144-A124-E395C0E7AA96 + uuid: 4C4C44C9-5555-3144-A16C-82A90B2087B3 - cmd: LC_BUILD_VERSION cmdsize: 24 platform: 1 @@ -239,9 +259,9 @@ LoadCommands: - cmd: LC_SYMTAB cmdsize: 24 symoff: 4096 - nsyms: 10 - stroff: 4256 - strsize: 156 + nsyms: 11 + stroff: 4272 + 
strsize: 179 - cmd: LC_SEGMENT_64 cmdsize: 72 segname: __PAGEZERO @@ -268,7 +288,7 @@ LoadCommands: - sectname: __text segname: __TEXT addr: 0x100000338 - size: 208 + size: 228 offset: 0x0 align: 2 reloff: 0x0 @@ -277,7 +297,7 @@ LoadCommands: reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: CFFAEDFE0C000001000000000A00000008000000C005000000000000000000001B000000180000004C4C441A55553144A124E395C0E7AA9632000000180000000100000000000B0000000B00000000000200000018000000001000000A000000A01000009C00000019000000480000005F5F504147455A45524F00000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000019000000980000005F5F54455854000000000000000000000000000001000000 + content: CFFAEDFE0C000001000000000A00000008000000C005000000000000000000001B000000180000004C4C44C955553144A16C82A90B2087B332000000180000000100000000000B0000000B00000000000200000018000000001000000B000000B0100000B300000019000000480000005F5F504147455A45524F00000000000000000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000019000000980000005F5F544558540000000000000000000000000000010000000040000000000000000000000000000000000000 - cmd: LC_SEGMENT_64 cmdsize: 152 segname: __DATA @@ -308,7 +328,7 @@ LoadCommands: vmaddr: 4295000064 vmsize: 4096 fileoff: 4096 - filesize: 316 + filesize: 355 maxprot: 1 initprot: 1 nsects: 0 @@ -319,7 +339,7 @@ LoadCommands: vmaddr: 4295004160 vmsize: 4096 fileoff: 8192 - filesize: 3507 + filesize: 3884 maxprot: 7 initprot: 3 nsects: 11 @@ -328,7 +348,7 @@ LoadCommands: - sectname: __debug_line segname: __DWARF addr: 0x100009000 - size: 323 + size: 357 offset: 0x2000 align: 0 reloff: 0x0 @@ -339,9 +359,9 @@ LoadCommands: reserved3: 0x0 - sectname: __debug_aranges segname: __DWARF - addr: 0x100009143 + addr: 0x100009165 size: 48 - offset: 0x2143 + offset: 0x2165 align: 0 reloff: 0x0 nreloc: 0 @@ -351,9 +371,9 @@ LoadCommands: reserved3: 0x0 - sectname: __debug_loc segname: __DWARF - addr: 0x100009173 - size: 1026 - offset: 0x2173 + addr: 0x100009195 + size: 1083 + offset: 0x2195 align: 0 reloff: 0x0 nreloc: 0 @@ -361,12 +381,12 @@ LoadCommands: reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: 
00000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000000000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000058000000000000006400000000000000010050640000000000000078000000000000000400A301509F00000000000000000000000000000000680000000000000078000000000000000100500000000000000000000000000000000084000000000000009000000000000000030011009F90000000000000009C000000000000000100639C00000000000000A800000000000000010064B800000000000000C00000000000000001006300000000000000000000000000000000 + content: 
00000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000000000000000000000800000000000000010050080000000000000014000000000000000400A301509F0000000000000000000000000000000004000000000000000C0000000000000001005800000000000000000000000000000000080000000000000014000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000014000000000000002000000000000000010050200000000000000034000000000000000400A301509F00000000000000000000000000000000240000000000000034000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000034000000000000004000000000000000010050400000000000000058000000000000000400A301509F000000000000000000000000000000003C0000000000000040000000000000000300707E9F000000000000000000000000000000004400000000000000580000000000000001005000000000000000000000000000000000340000000000000040000000000000000300707E9F00000000000000000000000000000000440000000000000058000000000000000100500000000000000000000000000000000058000000000000006400000000000000010050640000000000000078000000000000000400A301509F00000000000000000000000000000000680000000000000078000000000000000100500000000000000000000000000000000078000000000000007C000000000000000100507C0000000000000080000000000000000400A301509F000000000000000000000000000000008C000000000000009800000000000000030011009F9800000000000000A400000000000000010063A400000000000000B000000000000000010064C000000000000000D40000000000000001006300000000000000000000000000000000 - sectname: __debug_info segname: __DWARF - addr: 0x100009575 - size: 923 - offset: 0x2575 + addr: 0x1000095D0 + size: 988 + offset: 0x25D0 align: 0 reloff: 0x0 nreloc: 0 @@ -376,9 +396,9 @@ LoadCommands: reserved3: 0x0 - sectname: __debug_frame segname: __DWARF - addr: 0x100009910 - size: 272 - offset: 0x2910 + addr: 0x1000099AC + size: 296 + offset: 0x29AC align: 0 reloff: 0x0 nreloc: 0 @@ -386,12 +406,12 @@ LoadCommands: reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: 14000000FFFFFFFF0400080001781E0C1F000000000000001400000000000000380300000100000014000000000000001400000000000000380300000100000014000000000000001C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C0000000000000090030000010000002000000000000000480C1D109E019D022400000000000000B00300000100000058000000000000004C0C1D109E019D029303940400000000 + content: 
14000000FFFFFFFF0400080001781E0C1F000000000000001400000000000000380300000100000014000000000000001400000000000000380300000100000014000000000000001C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000004C030000010000002000000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C000000000000006C030000010000002400000000000000480C1D109E019D021C0000000000000090030000010000002000000000000000480C1D109E019D021400000000000000B00300000100000008000000000000002400000000000000B80300000100000064000000000000004C0C1D109E019D029303940400000000 - sectname: __debug_abbrev segname: __DWARF - addr: 0x100009A20 + addr: 0x100009AD4 size: 260 - offset: 0x2A20 + offset: 0x2AD4 align: 0 reloff: 0x0 nreloc: 0 @@ -401,9 +421,9 @@ LoadCommands: reserved3: 0x0 - sectname: __debug_str segname: __DWARF - addr: 0x100009B24 - size: 188 - offset: 0x2B24 + addr: 0x100009BD8 + size: 357 + offset: 0x2BD8 align: 0 reloff: 0x0 nreloc: 0 @@ -413,9 +433,9 @@ LoadCommands: reserved3: 0x0 - sectname: __apple_namespac segname: __DWARF - addr: 0x100009BE0 + addr: 0x100009D3D size: 36 - offset: 0x2BE0 + offset: 0x2D3D align: 0 reloff: 0x0 nreloc: 0 @@ -426,9 +446,9 @@ LoadCommands: content: 485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF - sectname: __apple_names segname: __DWARF - addr: 0x100009C04 - size: 316 - offset: 0x2C04 + addr: 0x100009D61 + size: 344 + offset: 0x2D61 align: 0 reloff: 0x0 nreloc: 0 @@ -436,12 +456,12 @@ LoadCommands: reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: 48534148010000000A0000000A0000000C0000000000000001000000010006000000000002000000030000000400000006000000FFFFFFFF0800000009000000FFFFFFFFFFFFFFFF88CB36CFF4B03BD389CB36CF0A452B694908311C0B452B694A08311CDC41AB586A7F9A7CAD7ED75898000000A8000000B8000000C8000000D8000000E8000000F80000000801000018010000280100008900000001000000BB010000000000001B000000010000002E0000000000000099000000010000003A020000000000002D000000010000004F000000000000005800000001000000E50000000000000048000000010000009A0000000000000068000000010000003901000000000000A900000001000000B902000000000000B3000000010000000D030000000000007800000002000000050200008402000000000000 + content: 48534148010000000B0000000B0000000C0000000000000001000000010006000000000002000000FFFFFFFFFFFFFFFF03000000FFFFFFFFFFFFFFFF04000000080000000A000000FFFFFFFFAD7ED75888CB36CF89CB36CFF4B03BD34908311CDC41AB580A452B696A7F9A7C4A08311C0B452B69404C8F68A4000000B8000000C8000000D8000000E8000000F800000008010000180100002801000038010000480100000B010000020000000502000084020000000000001C01000001000000BB010000000000002C010000010000003A02000000000000AE000000010000002E00000000000000EB00000001000000E5000000000000003C01000001000000B902000000000000C0000000010000004F000000000000005C010000010000003A03000000000000FB000000010000003901000000000000DB000000010000009A0000000000000046010000010000000D03000000000000 - sectname: __apple_types segname: __DWARF - addr: 0x100009D40 + addr: 0x100009EB9 size: 79 - offset: 0x2D40 + offset: 0x2EB9 align: 0 reloff: 0x0 nreloc: 0 @@ -449,12 +469,12 @@ LoadCommands: reserved1: 0x0 reserved2: 0x0 reserved3: 0x0 - content: 48534148010000000100000001000000180000000000000004000000010006000300050005000B0006000600000000003080880B38000000290000000100000048000000240000A4283A0C00000000 + content: 48534148010000000100000001000000180000000000000004000000010006000300050005000B0006000600000000003080880B38000000BC0000000100000048000000240000A4283A0C00000000 - sectname: __apple_objc segname: __DWARF - addr: 0x100009D8F + addr: 0x100009F08 
size: 36 - offset: 0x2D8F + offset: 0x2F08 align: 0 reloff: 0x0 nreloc: 0 @@ -469,7 +489,7 @@ LinkEditData: n_type: 0xF n_sect: 1 n_desc: 0 - n_value: 4294968240 + n_value: 4294968248 - n_strx: 8 n_type: 0xF n_sect: 1 @@ -507,10 +527,15 @@ LinkEditData: n_value: 4294968208 - n_strx: 121 n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294968240 + - n_strx: 144 + n_type: 0xF n_sect: 2 n_desc: 0 n_value: 4294983680 - - n_strx: 136 + - n_strx: 159 n_type: 0xF n_sect: 1 n_desc: 16 @@ -526,11 +551,13 @@ LinkEditData: - _function2_copy1 - _function2_copy2 - _function1 + - _function_bad_stmt_seq - _global_result - __mh_execute_header DWARF: debug_str: - '' + - 'Facebook clang version 15.82.1 (https://git.internal.tfbnw.net/repos/git/ro/osmeta/external/llvm-project 3c2ff7d3c6ef63fb9a83574cee1b376c969bd5ec)' - merged_funcs_test.cpp - '/' - . @@ -547,6 +574,7 @@ DWARF: - function2_copy1 - function2_copy2 - function1 + - function_bad_stmt_seq - main - sum debug_abbrev: @@ -793,9 +821,9 @@ DWARF: AddressSize: 0x8 Descriptors: - Address: 0x100000338 - Length: 0xD0 + Length: 0xE4 debug_info: - - Length: 0x397 + - Length: 0x3D8 Version: 4 AbbrevTableID: 0 AbbrOffset: 0x0 @@ -803,31 +831,31 @@ DWARF: Entries: - AbbrCode: 0x1 Values: - - Value: 0x0 - - Value: 0x21 - Value: 0x1 - - Value: 0x17 + - Value: 0x21 + - Value: 0x94 + - Value: 0xAA - Value: 0x0 - - Value: 0x19 + - Value: 0xAC - Value: 0x1 - Value: 0x100000338 - - Value: 0xD0 + - Value: 0xE4 - AbbrCode: 0x2 Values: - - Value: 0x1B + - Value: 0xAE - Value: 0x43 - Value: 0x1 - Value: 0x1 - Value: 0x2 - Value: 0x9 - BlockData: [ 0x3, 0x0, 0x40, 0x0, 0x0, 0x1, 0x0, 0x0, + BlockData: [ 0x3, 0x0, 0x40, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0 ] - AbbrCode: 0x3 Values: - Value: 0x48 - AbbrCode: 0x4 Values: - - Value: 0x29 + - Value: 0xBC - Value: 0x5 - Value: 0x4 - AbbrCode: 0x5 @@ -839,7 +867,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6F ] - Value: 0x1 - - Value: 0x2D + - Value: 0xC0 - Value: 0x1 - Value: 0x4 - Value: 0x48 @@ -848,21 +876,21 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0x0 - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x4 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x39 - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x5 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x5C - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x6 - Value: 0x48 @@ -876,7 +904,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6F ] - Value: 0x1 - - Value: 0x48 + - Value: 0xDB - Value: 0x1 - Value: 0xB - Value: 0x48 @@ -885,21 +913,21 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0x7F - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0xB - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0xB8 - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0xC - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0xDB - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0xD - Value: 0x48 @@ -912,7 +940,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6D ] - Value: 0x1 - - Value: 0x58 + - Value: 0xEB - Value: 0x1 - Value: 0x12 - Value: 0x48 @@ -921,20 +949,20 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0xFE - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x12 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x137 - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x14 - Value: 0x48 - AbbrCode: 0x9 Values: - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x13 - Value: 0x48 @@ -951,7 +979,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6D ] - Value: 0x1 - - Value: 0x68 + - Value: 0xFB - Value: 0x1 - Value: 0x19 - Value: 0x48 @@ -960,20 +988,20 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0x15A - - 
Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x19 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x193 - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x1B - Value: 0x48 - AbbrCode: 0x9 Values: - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x1A - Value: 0x48 @@ -984,7 +1012,7 @@ DWARF: - AbbrCode: 0x0 - AbbrCode: 0xB Values: - - Value: 0x78 + - Value: 0x10B - Value: 0x1 - Value: 0x20 - Value: 0x48 @@ -993,19 +1021,19 @@ DWARF: - Value: 0x1 - AbbrCode: 0xC Values: - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x20 - Value: 0x48 - AbbrCode: 0x9 Values: - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x22 - Value: 0x48 - AbbrCode: 0x9 Values: - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x21 - Value: 0x48 @@ -1018,7 +1046,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6D ] - Value: 0x1 - - Value: 0x89 + - Value: 0x11C - Value: 0x1 - Value: 0x27 - Value: 0x48 @@ -1027,21 +1055,21 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0x1B6 - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x27 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x1EF - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x28 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x214 - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x29 - Value: 0x48 @@ -1075,7 +1103,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6D ] - Value: 0x1 - - Value: 0x99 + - Value: 0x12C - Value: 0x1 - Value: 0x2E - Value: 0x48 @@ -1084,21 +1112,21 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0x27F - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x2E - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x2B8 - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x2F - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x2DD - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x30 - Value: 0x48 @@ -1132,7 +1160,7 @@ DWARF: - Value: 0x1 BlockData: [ 0x6D ] - Value: 0x1 - - Value: 0xA9 + - Value: 0x13C - Value: 0x1 - Value: 0x35 - Value: 0x48 @@ -1141,20 +1169,20 @@ DWARF: - AbbrCode: 0x6 Values: - Value: 0x348 - - Value: 0x3D + - Value: 0xD0 - Value: 0x1 - Value: 0x35 - Value: 0x48 - AbbrCode: 0x7 Values: - Value: 0x381 - - Value: 0x41 + - Value: 0xD4 - Value: 0x1 - Value: 0x37 - Value: 0x48 - AbbrCode: 0x9 Values: - - Value: 0x3F + - Value: 0xD2 - Value: 0x1 - Value: 0x36 - Value: 0x48 @@ -1163,31 +1191,54 @@ DWARF: - Value: 0x1BB - Value: 0x1000003A0 - AbbrCode: 0x0 - - AbbrCode: 0x8 + - AbbrCode: 0x5 Values: - Value: 0x1000003B0 - - Value: 0x58 - - Value: 0x112 + - Value: 0x8 + - Value: 0x1 + - Value: 0xFFFFFFFF + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x146 + - Value: 0x1 + - Value: 0x3E + - Value: 0x48 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x6 + Values: + - Value: 0x3A4 + - Value: 0xD0 + - Value: 0x1 + - Value: 0x3F + - Value: 0x48 + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x1000003B8 + - Value: 0x64 + - Value: 0x12B - Value: 0x1 BlockData: [ 0x6D ] - Value: 0x1 - - Value: 0xB3 + - Value: 0x15C - Value: 0x1 - - Value: 0x3C + - Value: 0x44 - Value: 0x48 - Value: 0x1 - Value: 0x1 - AbbrCode: 0x7 Values: - - Value: 0x3A4 - - Value: 0xB8 + - Value: 0x3DD + - Value: 0x161 - Value: 0x1 - - Value: 0x3D + - Value: 0x45 - Value: 0x48 - AbbrCode: 0x10 Values: - Value: 0x2B9 - - Value: 0x1000003C4 + - Value: 0x1000003CC - AbbrCode: 0x11 Values: - Value: 0x1 @@ -1198,7 +1249,7 @@ DWARF: - AbbrCode: 0x10 Values: - Value: 0x23A - - Value: 0x1000003D0 + - Value: 0x1000003D8 - AbbrCode: 0x11 Values: - Value: 0x1 @@ -1209,7 +1260,7 @@ DWARF: - AbbrCode: 0x10 Values: - Value: 0x9A - - Value: 
0x1000003DC + - Value: 0x1000003E4 - AbbrCode: 0x11 Values: - Value: 0x1 @@ -1220,7 +1271,7 @@ DWARF: - AbbrCode: 0x10 Values: - Value: 0x139 - - Value: 0x1000003E8 + - Value: 0x1000003F0 - AbbrCode: 0x11 Values: - Value: 0x1 @@ -1231,7 +1282,7 @@ DWARF: - AbbrCode: 0x10 Values: - Value: 0x1BB - - Value: 0x1000003F8 + - Value: 0x100000400 - AbbrCode: 0x11 Values: - Value: 0x1 @@ -1239,10 +1290,21 @@ DWARF: - Value: 0x1 BlockData: [ 0x3B ] - AbbrCode: 0x0 + - AbbrCode: 0x10 + Values: + - Value: 0x30D + - Value: 0x10000040C + - AbbrCode: 0x11 + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x2 + BlockData: [ 0x83, 0x0 ] + - AbbrCode: 0x0 - AbbrCode: 0x0 - AbbrCode: 0x0 debug_line: - - Length: 319 + - Length: 353 Version: 4 PrologueLength: 45 MinInstLength: 1 @@ -1495,8 +1557,31 @@ DWARF: ExtLen: 9 SubOpcode: DW_LNE_set_address Data: 4294968240 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 - Opcode: DW_LNS_advance_line - SData: 59 + SData: 64 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 4294968248 + - Opcode: DW_LNS_advance_line + SData: 67 Data: 0 - Opcode: DW_LNS_copy Data: 0 @@ -1539,6 +1624,18 @@ DWARF: - Opcode: 0x82 Data: 0 - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4B + Data: 0 + - Opcode: DW_LNS_set_column + Data: 9 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x82 + Data: 0 + - Opcode: DW_LNS_set_column Data: 5 - Opcode: DW_LNS_negate_stmt Data: 0 |
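
; --- Illustrative sketch (not part of the patch above) ---
; The test updates in this patch all follow the same migration for the masked
; load/store/gather/scatter intrinsics: the explicit i32 alignment immarg is
; dropped and the alignment is instead carried as an `align` attribute on the
; pointer (or vector-of-pointers) operand. The names %p, %m, %pt below are
; hypothetical, chosen only to show the before/after call shape.
;
; Old form (alignment as an i32 immarg operand):
;   %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 16, <4 x i1> %m, <4 x i32> %pt)
;
; New form (alignment as an attribute on the pointer operand):
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, <4 x i1>, <4 x i32>)

define <4 x i32> @masked_load_example(ptr %p, <4 x i1> %m, <4 x i32> %pt) {
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %p, <4 x i1> %m, <4 x i32> %pt)
  ret <4 x i32> %v
}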