48 files changed, 4219 insertions, 369 deletions
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
index e784d25..acac2c9 100644
--- a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
@@ -447,6 +447,84 @@ bb5:
   ret void
 }
 
+define void @pr46786_c26_char_cmp_ops_swapped(ptr %arg, ptr %arg1, ptr %arg2) {
+; X64-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X64-NEXT:  Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT:    %i4 = ptrtoint ptr %arg to i64
+; X64-NEXT:    --> (ptrtoint ptr %arg to i64) U: full-set S: full-set
+; X64-NEXT:    %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X64-NEXT:    --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT:    %i8 = load i8, ptr %i7, align 1
+; X64-NEXT:    --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT:    %i9 = ptrtoint ptr %i7 to i64
+; X64-NEXT:    --> {(ptrtoint ptr %arg to i64),+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT:    %i10 = sub i64 %i9, %i4
+; X64-NEXT:    --> {0,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT:    %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X64-NEXT:    --> {%arg2,+,1}<nw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable }
+; X64-NEXT:    %i12 = load i8, ptr %i11, align 1
+; X64-NEXT:    --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT:    %i13 = add i8 %i12, %i8
+; X64-NEXT:    --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT:    %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X64-NEXT:    --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT:  Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT:  Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT:  Loop %bb6: constant max backedge-taken count is i64 -1
+; X64-NEXT:  Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT:  Loop %bb6: Trip multiple is 1
+;
+; X32-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X32-NEXT:  Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT:    %i4 = ptrtoint ptr %arg to i64
+; X32-NEXT:    --> (zext i32 (ptrtoint ptr %arg to i32) to i64) U: [0,4294967296) S: [0,4294967296)
+; X32-NEXT:    %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X32-NEXT:    --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT:    %i8 = load i8, ptr %i7, align 1
+; X32-NEXT:    --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT:    %i9 = ptrtoint ptr %i7 to i64
+; X32-NEXT:    --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<nuw><%bb6> U: [0,8589934591) S: [0,8589934591) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable }
+; X32-NEXT:    %i10 = sub i64 %i9, %i4
+; X32-NEXT:    --> {0,+,1}<nuw><%bb6> U: [0,4294967296) S: [0,4294967296) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable }
+; X32-NEXT:    %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X32-NEXT:    --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable }
+; X32-NEXT:    %i12 = load i8, ptr %i11, align 1
+; X32-NEXT:    --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT:    %i13 = add i8 %i12, %i8
+; X32-NEXT:    --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT:    %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X32-NEXT:    --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT:  Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT:  Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT:  Loop %bb6: constant max backedge-taken count is i32 -1
+; X32-NEXT:  Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT:  Loop %bb6: Trip multiple is 1
+;
+  %i = icmp eq ptr %arg1, %arg ; guard: skip the loop when %arg1 == %arg; operands are given as (%arg1, %arg)
+  br i1 %i, label %bb5, label %bb3
+
+bb3:
+  %i4 = ptrtoint ptr %arg to i64 ; integer value of %arg, computed once before the loop
+  br label %bb6
+
+bb6:
+  %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ] ; cursor: starts at %arg, becomes %i14 on each backedge
+  %i8 = load i8, ptr %i7
+  %i9 = ptrtoint ptr %i7 to i64
+  %i10 = sub i64 %i9, %i4 ; byte offset of the cursor from %arg
+  %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10 ; same byte offset applied to %arg2
+  %i12 = load i8, ptr %i11
+  %i13 = add i8 %i12, %i8
+  store i8 %i13, ptr %i11 ; %arg2[offset] += byte at the cursor
+  %i14 = getelementptr inbounds i8, ptr %i7, i64 1 ; advance the cursor by one byte
+  %i15 = icmp eq ptr %i14, %arg1 ; leave the loop once the advanced cursor equals %arg1
+  br i1 %i15, label %bb5, label %bb6
+
+bb5:
+  ret void
+}
+
+
 ; void pr46786_c26_int(int* start, int *end, int *other) {
 ;   for (int* cur = start; cur != end; ++cur)
 ;     other[cur - start] += *cur;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
index 8552931..ee35447 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
@@ -102,8 +102,8 @@ body:             |
   ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
   ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
     %0:_(<4 x s16>) = COPY $d0
-    %2:_(s16) = COPY $h0
-    %1:_(s16) = G_CONSTANT i16 3
+    %1:_(s16) = COPY $h0
+    %2:_(s16) = G_CONSTANT i16 3
     %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
     %4:_(<4 x s16>) = G_ASHR %0, %3
 ...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
index 61d1c43..97bcb80 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
@@ -135,8 +135,8 @@ body:             |
   ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
   ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
     %0:_(<4 x s16>) = COPY $d0
-    %2:_(s16) = COPY $h0
-    %1:_(s16) = G_CONSTANT i16 3
+    %1:_(s16) = COPY $h0
+    %2:_(s16) = G_CONSTANT i16 3
     %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
     %4:_(<4 x s16>) = G_SHL %0, %3
 ...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
new file mode 100644
index 0000000..332049d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
@@ -0,0 +1,276 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name:            Cst
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @Cst
+  ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6
+  ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3
+  ; CHECK-NEXT: %2:_ KnownBits:00100010 SignBits:2
+    %0:_(s8) = G_CONSTANT i8 2
+    %1:_(s8) = G_CONSTANT i8 224 ; 0b11100000, i.e. -32 when read as a signed i8
+    %2:_(s8) = G_SUB %0, %1 ; constant - constant: 2 - 224 wraps to 34 (0b00100010) in 8 bits
+...
+---
+name:            CstZero
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @CstZero
+  ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+  ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+  ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+    %0:_(s8) = G_CONSTANT i8 0
+    %1:_(s8) = G_CONSTANT i8 0
+    %2:_(s8) = G_SUB %0, %1 ; 0 - 0: every bit of the result is a known zero
+...
+---
+name:            CstNegOne
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @CstNegOne
+  ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+  ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7
+  ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+    %0:_(s8) = G_CONSTANT i8 0
+    %1:_(s8) = G_CONSTANT i8 1
+    %2:_(s8) = G_SUB %0, %1 ; negation written as 0 - 1: result is -1, all bits set
+...
+---
+name:            CstNegFour
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @CstNegFour
+  ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+  ; CHECK-NEXT: %1:_ KnownBits:00000100 SignBits:5
+  ; CHECK-NEXT: %2:_ KnownBits:11111100 SignBits:6
+    %0:_(s8) = G_CONSTANT i8 0
+    %1:_(s8) = G_CONSTANT i8 4
+    %2:_(s8) = G_SUB %0, %1 ; negation written as 0 - 4: result is -4 (0b11111100)
+...
+---
+name:            CstNeg
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @CstNeg
+  ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3
+  ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6
+  ; CHECK-NEXT: %2:_ KnownBits:11011110 SignBits:2
+    %0:_(s8) = G_CONSTANT i8 224 ; 0b11100000, i.e. -32 when read as a signed i8
+    %1:_(s8) = G_CONSTANT i8 2
+    %2:_(s8) = G_SUB %0, %1 ; 224 - 2 = 222 (0b11011110), i.e. -34 as a signed i8
+...
+---
+name:            ScalarVar
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @ScalarVar
+  ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = COPY $b1
+    %2:_(s8) = G_SUB %0, %1 ; both operands are register copies with no known bits
+...
+---
+name:            ScalarRhsEarlyOut
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @ScalarRhsEarlyOut
+  ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+  ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = G_CONSTANT i8 3
+    %2:_(s8) = G_SUB %1, %0 ; RHS (%0) is the operand with SignBits:1, same shape as VectorRhsEarlyOut
+...
+---
+name:            ScalarNonNegative
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @ScalarNonNegative
+  ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+  ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+  ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8
+  ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = G_CONSTANT i8 15
+    %2:_(s8) = G_AND %0, %1 ; mask to the low 4 bits, so the sign bit is a known zero
+    %3:_(s8) = G_CONSTANT i8 0
+    %4:_(s8) = G_SUB %3, %2 ; negation (0 - x) of a known non-negative value
+...
+---
+name:            ScalarLhsEarlyOut
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @ScalarLhsEarlyOut
+  ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+  ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = G_CONSTANT i8 3
+    %2:_(s8) = G_SUB %0, %1 ; LHS (%0) is the operand with SignBits:1, same shape as VectorLhsEarlyOut
+...
+---
+name:            ScalarPartKnown
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @ScalarPartKnown
+  ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+  ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+  ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5
+  ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:3
+    %0:_(s8) = COPY $b0
+    %1:_(s8) = G_CONSTANT i8 15
+    %2:_(s8) = G_AND %0, %1 ; only the high 4 bits are known (zero)
+    %3:_(s8) = G_CONSTANT i8 5
+    %4:_(s8) = G_SUB %2, %3 ; partially known LHS minus a constant
+...
+---
+name:            VectorCstZero
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorCstZero
+  ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %1:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %3:_ KnownBits:0000000000000000 SignBits:16
+    %0:_(s16) = G_CONSTANT i16 0
+    %1:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+    %3:_(<4 x s16>) = G_SUB %1, %2 ; vector form of 0 - 0: zero splat minus zero splat
+...
+---
+name:            VectorCstNegOne
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorCstNegOne
+  ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %1:_ KnownBits:0000000000000001 SignBits:15
+  ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15
+  ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16
+    %0:_(s16) = G_CONSTANT i16 0
+    %1:_(s16) = G_CONSTANT i16 1
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+    %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %4:_(<4 x s16>) = G_SUB %2, %3 ; vector form of 0 - 1: every lane is -1, all bits set
+...
+---
+name:            VectorVar
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorVar
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(<4 x s16>) = COPY $d1
+    %2:_(<4 x s16>) = G_SUB %0, %1 ; both operands are register copies with no known bits
+...
+---
+name:            VectorRhsEarlyOut
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorRhsEarlyOut
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+  ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = G_CONSTANT i16 3
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %3:_(<4 x s16>) = G_SUB %2, %0 ; LHS is the splat of 3; RHS (%0) is the operand with SignBits:1
+...
+---
+name:            VectorNonNegative
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorNonNegative
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+  ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+  ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+  ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %5:_ KnownBits:0000000000000000 SignBits:16
+  ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = G_CONSTANT i16 255
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %3:_(<4 x s16>) = G_AND %0, %2 ; mask each lane to its low 8 bits, so the sign bit is a known zero
+    %4:_(s16) = G_CONSTANT i16 0
+    %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4
+    %6:_(<4 x s16>) = G_SUB %5, %3 ; negation (0 - x) of known non-negative lanes
+...
+---
+name:            VectorLhsEarlyOut
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorLhsEarlyOut
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+  ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = G_CONSTANT i16 3
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %3:_(<4 x s16>) = G_SUB %0, %2 ; LHS (%0) is the operand with SignBits:1; RHS is the splat of 3
+...
+---
+name:            VectorPartKnown
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorPartKnown
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+  ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+  ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+  ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10
+  ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9
+  ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9
+  ; CHECK-NEXT: %7:_ KnownBits:???????????????? SignBits:7
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = G_CONSTANT i16 255
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+    %3:_(<4 x s16>) = G_AND %0, %2 ; only the high 8 bits of each lane are known (zero)
+    %4:_(s16) = G_CONSTANT i16 42
+    %5:_(s16) = G_CONSTANT i16 74
+    %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4 ; lanes mix 42 and 74, so only the bits they share are known
+    %7:_(<4 x s16>) = G_SUB %6, %3 ; mixed-constant LHS minus a partially known RHS
+...
+---
+name:            VectorCst36
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorCst36
+  ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14
+  ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13
+  ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13
+  ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:12
+    %0:_(s16) = G_CONSTANT i16 3
+    %1:_(s16) = G_CONSTANT i16 6
+    %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 ; lanes mix 3 and 6, so only the bits they share are known
+    %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+    %4:_(<4 x s16>) = G_SUB %2, %3 ; operands are identical build vectors; the expectation still lists every bit as unknown
+...
+
+---
+name:            VectorCst3unknown
+body:             |
+  bb.1:
+  ; CHECK-LABEL: name: @VectorCst3unknown
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+    %0:_(<4 x s16>) = COPY $d0
+    %1:_(s16) = COPY $h0
+    %2:_(s16) = G_CONSTANT i16 3
+    %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 ; lanes mix an unknown scalar with constant 3, so no bit is known for the vector
+    %4:_(<4 x s16>) = G_SUB %0, %3 ; unknown LHS minus a build vector with unknown lanes
+...
diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll
index aa070b7..9b456a5 100644
--- a/llvm/test/CodeGen/AArch64/adds_cmn.ll
+++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll
@@ -22,10 +22,8 @@ entry:
 define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) {
 ; CHECK-LABEL: adds_cmn_c:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cmn w0, w1
-; CHECK-NEXT:    add w1, w1, w0
-; CHECK-NEXT:    cset w8, lo
-; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    adds w1, w0, w1
+; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index ecd48d6..149b4c4 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -290,8 +290,7 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
 define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) {
 ; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add w8, w0, w1
-; CHECK-NEXT:    cmn w1, w0
+; CHECK-NEXT:    adds w8, w1, w0
 ; CHECK-NEXT:    csinv w0, w8, wzr, lo
 ; CHECK-NEXT:    ret
   %noty = xor i32 %y, -1
@@ -331,8 +330,7 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) {
 define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) {
 ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add x8, x0, x1
-; CHECK-NEXT:    cmn x1, x0
+; CHECK-NEXT:    adds x8, x1, x0
 ; CHECK-NEXT:    csinv x0, x8, xzr, lo
 ; CHECK-NEXT:    ret
   %noty = xor i64 %y, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f96a6f7..b239c46 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,13 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+; GCN-LABEL: kernel_ieee_mode_default:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 1
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+; GCN-LABEL: kernel_ieee_mode_on:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 1
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}kernel_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+; GCN-LABEL: kernel_ieee_mode_off:
+; GCN:         .amd_kernel_code_t
+; GCN-NEXT:     amd_code_version_major = 1
+; GCN-NEXT:     amd_code_version_minor = 2
+; GCN-NEXT:     amd_machine_kind = 1
+; GCN-NEXT:     amd_machine_version_major = 6
+; GCN-NEXT:     amd_machine_version_minor = 0
+; GCN-NEXT:     amd_machine_version_stepping = 0
+; GCN-NEXT:     kernel_code_entry_byte_offset = 256
+; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
+; GCN-NEXT:     granulated_workitem_vgpr_count = 0
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 0
+; GCN-NEXT:     priority = 0
+; GCN-NEXT:     float_mode = 240
+; GCN-NEXT:     priv = 0
+; GCN-NEXT:     enable_dx10_clamp = 1
+; GCN-NEXT:     debug_mode = 0
+; GCN-NEXT:     enable_ieee_mode = 0
+; GCN-NEXT:     enable_wgp_mode = 0
+; GCN-NEXT:     enable_mem_ordered = 0
+; GCN-NEXT:     enable_fwd_progress = 0
+; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT:     user_sgpr_count = 12
+; GCN-NEXT:     enable_trap_handler = 0
+; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT:     enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT:     enable_sgpr_workgroup_info = 0
+; GCN-NEXT:     enable_vgpr_workitem_id = 2
+; GCN-NEXT:     enable_exception_msb = 0
+; GCN-NEXT:     granulated_lds_size = 0
+; GCN-NEXT:     enable_exception = 0
+; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT:     enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT:     enable_sgpr_queue_ptr = 1
+; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT:     enable_sgpr_dispatch_id = 1
+; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT:     enable_sgpr_private_segment_size = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT:     enable_wavefront_size32 = 0
+; GCN-NEXT:     enable_ordered_append_gds = 0
+; GCN-NEXT:     private_element_size = 1
+; GCN-NEXT:     is_ptr64 = 1
+; GCN-NEXT:     is_dynamic_callstack = 0
+; GCN-NEXT:     is_debug_enabled = 0
+; GCN-NEXT:     is_xnack_enabled = 0
+; GCN-NEXT:     workitem_private_segment_byte_size = 0
+; GCN-NEXT:     workgroup_group_segment_byte_size = 0
+; GCN-NEXT:     gds_segment_byte_size = 0
+; GCN-NEXT:     kernarg_segment_byte_size = 16
+; GCN-NEXT:     workgroup_fbarrier_count = 0
+; GCN-NEXT:     wavefront_sgpr_count = 4
+; GCN-NEXT:     workitem_vgpr_count = 2
+; GCN-NEXT:     reserved_vgpr_first = 0
+; GCN-NEXT:     reserved_vgpr_count = 0
+; GCN-NEXT:     reserved_sgpr_first = 0
+; GCN-NEXT:     reserved_sgpr_count = 0
+; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT:     kernarg_segment_alignment = 4
+; GCN-NEXT:     group_segment_alignment = 4
+; GCN-NEXT:     private_segment_alignment = 4
+; GCN-NEXT:     wavefront_size = 6
+; GCN-NEXT:     call_convention = -1
+; GCN-NEXT:     runtime_loader_kernel_symbol = 0
+; GCN-NEXT:    .end_amd_kernel_code_t
+; GCN-NEXT:  ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_default() #0 {
+; GCN-LABEL: func_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_on() #1 {
+; GCN-LABEL: func_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}func_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define void @func_ieee_mode_off() #2 {
+; GCN-LABEL: func_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_default() #0 {
+; GCN-LABEL: cs_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_on() #1 {
+; GCN-LABEL: cs_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}cs_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_cs void @cs_ieee_mode_off() #2 {
+; GCN-LABEL: cs_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_default() #0 {
+; GCN-LABEL: ps_ieee_mode_default:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_on() #1 {
+; GCN-LABEL: ps_ieee_mode_on:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
 define amdgpu_ps void @ps_ieee_mode_off() #2 {
+; GCN-LABEL: ps_ieee_mode_off:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
   %val0 = load volatile float, ptr addrspace(1) poison
   %val1 = load volatile float, ptr addrspace(1) poison
   %min = call float @llvm.minnum.f32(float %val0, float %val1)
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
new file mode 100644
index 0000000..a4aad57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -0,0 +1,64 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+#
+# Per the test name, BUFFER_LOAD_DWORDX4_LDS_OFFEN must not be picked as a VALU
+# instruction when the SCHED_GROUP_BARRIER groups are formed: the expected
+# schedule fills the mask-2 groups (sizes 2, 2, 4) with V_ADD_U32_e32 only and
+# places one S_MOV_B32 to $m0 (mask-4 group, size 1) ahead of each buffer load.
+
+---
+name: buffer_load_lds_not_valu
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: buffer_load_lds_not_valu
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $exec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
+    ; CHECK-NEXT: $m0 = S_MOV_B32 0
+    ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
+    ; CHECK-NEXT: $m0 = S_MOV_B32 1
+    ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+    ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
+    ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+    ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+    ; CHECK-NEXT: S_ENDPGM 0
+    $exec = IMPLICIT_DEF
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sgpr_128 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
+    %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
+    $m0 = S_MOV_B32 0
+    BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+    $m0 = S_MOV_B32 1
+    BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+    %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
+    %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
+    %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
+    %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec
+    %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec
+    SCHED_GROUP_BARRIER 2, 2, 0
+    SCHED_GROUP_BARRIER 4, 1, 0
+    SCHED_GROUP_BARRIER 2, 2, 0
+    SCHED_GROUP_BARRIER 4, 1, 0
+    SCHED_GROUP_BARRIER 2, 4, 0
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
index c8fee5d..7cbe5de 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
@@ -119,9 +119,10 @@ body:             |
     ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
     %2(s16) = G_CTLZ %1
 
-    ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
-    ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
-    ; CHECK: $r0 = COPY [[R]]
+    ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
+    ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+    ; LIBCALLS: $r0 = COPY [[R]]
+    ; CLZ: $r0 = COPY [[R32]]
     %3(s32) = G_SEXT %2(s16)
     $r0 = COPY %3(s32)
     BX_RET 14, $noreg, implicit $r0
diff --git a/llvm/test/CodeGen/ARM/carry.ll b/llvm/test/CodeGen/ARM/carry.ll
index 558e2b0..a652241 100644
--- a/llvm/test/CodeGen/ARM/carry.ll
+++ b/llvm/test/CodeGen/ARM/carry.ll
@@ -1,61 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s
 
 define i64 @f1(i64 %a, i64 %b) {
 ; CHECK-LABEL: f1:
-; CHECK: subs r
-; CHECK: sbc r
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    subs r0, r0, r2
+; CHECK-NEXT:    sbc r1, r1, r3
+; CHECK-NEXT:    bx lr
 entry:
-	%tmp = sub i64 %a, %b
-	ret i64 %tmp
+  %tmp = sub i64 %a, %b
+  ret i64 %tmp
 }
 
 define i64 @f2(i64 %a, i64 %b) {
 ; CHECK-LABEL: f2:
-; CHECK: lsl  r
-; CHECK: orr  r
-; CHECK: rsbs r
-; CHECK: sbc  r
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsl r1, r1, #1
+; CHECK-NEXT:    orr r1, r1, r0, lsr #31
+; CHECK-NEXT:    rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT:    sbc r1, r1, r3
+; CHECK-NEXT:    bx lr
 entry:
-        %tmp1 = shl i64 %a, 1
-	%tmp2 = sub i64 %tmp1, %b
-	ret i64 %tmp2
+  %tmp1 = shl i64 %a, 1
+  %tmp2 = sub i64 %tmp1, %b
+  ret i64 %tmp2
 }
 
 ; add with live carry
 define i64 @f3(i32 %al, i32 %bl) {
 ; CHECK-LABEL: f3:
-; CHECK: adds r
-; CHECK: adc r
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    mov r2, #0
+; CHECK-NEXT:    adcs r0, r1, #0
+; CHECK-NEXT:    adc r1, r2, #0
+; CHECK-NEXT:    bx lr
 entry:
-        ; unsigned wide add
-        %aw = zext i32 %al to i64
-        %bw = zext i32 %bl to i64
-        %cw = add i64 %aw, %bw
-        ; ch == carry bit
-        %ch = lshr i64 %cw, 32
-	%dw = add i64 %ch, %bw
-	ret i64 %dw
+  ; unsigned wide add: both operands are zero-extended, so the carry-out of the 32-bit sum lands in bit 32 of %cw
+  %aw = zext i32 %al to i64
+  %bw = zext i32 %bl to i64
+  %cw = add i64 %aw, %bw
+  ; ch == carry bit (bit 32 of %cw, so 0 or 1); it is consumed by the second add below
+  %ch = lshr i64 %cw, 32
+  %dw = add i64 %ch, %bw
+  ret i64 %dw
 }
 
 ; rdar://10073745
 define i64 @f4(i64 %x) nounwind readnone {
-entry:
 ; CHECK-LABEL: f4:
-; CHECK: rsbs r
-; CHECK: rsc r
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    rsbs r0, r0, #0
+; CHECK-NEXT:    rsc r1, r1, #0
+; CHECK-NEXT:    bx lr
+entry:
   %0 = sub nsw i64 0, %x
   ret i64 %0
 }
 
 ; rdar://12559385
 define i64 @f5(i32 %vi) {
-entry:
 ; CHECK-LABEL: f5:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
-    %v0 = zext i32 %vi to i64
-    %v1 = xor i64 %v0, -155057456198619
-    %v4 = add i64 %v1, 155057456198619
-    %v5 = add i64 %v4, %v1
-    ret i64 %v5
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movw r1, #19493
+; CHECK-NEXT:    movw r2, #29433
+; CHECK-NEXT:    movt r1, #57191
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    movw r3, #46043
+; CHECK-NEXT:    movt r2, #65535
+; CHECK-NEXT:    adds r0, r0, r0
+; CHECK-NEXT:    movw r1, #36102
+; CHECK-NEXT:    sbc r2, r2, r1
+; CHECK-NEXT:    movt r3, #8344
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adc r1, r2, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %v0 = zext i32 %vi to i64
+  %v1 = xor i64 %v0, -155057456198619
+  %v4 = add i64 %v1, 155057456198619
+  %v5 = add i64 %v4, %v1
+  ret i64 %v5
 }
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index 1edb387..f345e08 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -2,9 +2,13 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 
 declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index 2e80c4c..29b130f 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -2,9 +2,13 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
 declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index 817b1d5..4e463a14 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
 ; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1(
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
index cbf647f..fc8cce4 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
 declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
index a37b1a9..22eb729 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
@@ -2,9 +2,13 @@
 ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100f | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %}
 
 ; CHECK-LABEL: nvvm_tcgen05_ld_16x64b
 define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index bf2adac..33483b5 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | %ptxas-verify -arch=sm_110a %}
 
 declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
 declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
index 0636a06..ccf6541 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
@@ -2,9 +2,13 @@
 ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
 ; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}
 ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 ; CHECK-LABEL: nvvm_tcgen05_st_16x64b
 define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
new file mode 100644
index 0000000..389283a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
@@ -0,0 +1,523 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \
+# RUN:     -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s
+
+--- |
+  define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    ret void
+  }
+
+  define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4)
+    ret void
+  }
+
+  define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    ret void
+  }
+
+  define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+  entry:
+    %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+    %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+    call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+    ret <vscale x 64 x i8> %1
+  }
+
+  define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+  entry:
+    %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl)
+    %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl) ; result intentionally unused
+    %2 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %0, <vscale x 64 x i8> %0, i64 %vl) ; adds the first vadd result to itself, as the MIR body does
+    call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+    ret <vscale x 64 x i8> %2
+  }
+
+  define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) {
+  entry:
+    tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2)
+    call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn)
+    ret void
+  }
+
+  define i64 @vsettnt_max(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+    %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define i64 @single_vsettm(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define i64 @single_vsettn(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define i64 @single_vsettk(i64 %vl) {
+  entry:
+    %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2)
+    ret i64 %0
+  }
+
+  define void @sf_vtzero(i64 %tm, i64 %tn) {
+  entry:
+    call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4)
+    ret void
+  }
+
+  declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64)
+  declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64)
+  declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64)
+  declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64)
+  declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64)
+  declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64)
+  declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64)
+  declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64)
+  declare i64 @llvm.riscv.sf.vsettk.i64(i64, i64, i64)
+  declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64)
+...
+---
+name:            xsfmm_same_state
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: vrm8 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$v16m8', virtual-reg: '%1' } # %1 is defined by COPY $v16m8 in the body
+  - { reg: '$x10', virtual-reg: '%2' }
+  - { reg: '$x11', virtual-reg: '%3' }
+  - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-LABEL: name: xsfmm_same_state
+    ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %4:gprnox0 = COPY $x12
+    %3:gprnox0 = COPY $x11
+    %2:gprnox0 = COPY $x10
+    %1:vrm8 = COPY $v16m8
+    %0:vrm8 = COPY $v8m8
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoRET
+...
+---
+name:            xsfmm_different_state
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: vrm8 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$v16m8', virtual-reg: '%1' } # %1 is defined by COPY $v16m8 in the body
+  - { reg: '$x10', virtual-reg: '%2' }
+  - { reg: '$x11', virtual-reg: '%3' }
+  - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-LABEL: name: xsfmm_different_state
+    ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %4:gprnox0 = COPY $x12
+    %3:gprnox0 = COPY $x11
+    %2:gprnox0 = COPY $x10
+    %1:vrm8 = COPY $v16m8
+    %0:vrm8 = COPY $v8m8
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm
+    PseudoRET
+...
+---
+name:            xsfmm_different_state_bf
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: vrm8 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$v16m8', virtual-reg: '%1' } # %1 is defined by COPY $v16m8 in the body
+  - { reg: '$x10', virtual-reg: '%2' }
+  - { reg: '$x11', virtual-reg: '%3' }
+  - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-LABEL: name: xsfmm_different_state_bf
+    ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %4:gprnox0 = COPY $x12
+    %3:gprnox0 = COPY $x11
+    %2:gprnox0 = COPY $x10
+    %1:vrm8 = COPY $v16m8
+    %0:vrm8 = COPY $v8m8
+    PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+    PseudoRET
+...
+---
+name:            interleave_rvv_and_xsfmm
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: gprnox0 }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: vrm8 }
+  - { id: 5, class: vrm8 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$x10', virtual-reg: '%1' }
+  - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $x10, $x11
+    ; CHECK-LABEL: name: interleave_rvv_and_xsfmm
+    ; CHECK: liveins: $v8m8, $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %2:gpr = COPY $x11
+    %1:gprnox0 = COPY $x10
+    %0:vrm8 = COPY $v8m8
+    %3:gpr = ADDI $x0, 1
+    %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+    %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+    PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+    $v8m8 = COPY %5:vrm8
+    PseudoRET implicit $v8m8
+...
+---
+name:            interleave_rvv_and_xsfmm2
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: gprnox0 }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: vrm8 }
+  - { id: 5, class: vrm8 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$x10', virtual-reg: '%1' }
+  - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $x10, $x11
+    ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2
+    ; CHECK: liveins: $v8m8, $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+    ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype
+    ; CHECK-NEXT: PseudoRET implicit $v8m8
+    %2:gpr = COPY $x11
+    %1:gprnox0 = COPY $x10
+    %0:vrm8 = COPY $v8m8
+    %3:gpr = ADDI $x0, 1
+    %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0
+    %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+    %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+    PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+    $v8m8 = COPY %6:vrm8
+    PseudoRET implicit $v8m8
+...
+---
+name:            consecutive_xsfmm
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vrm8 }
+  - { id: 1, class: gprnox0 }
+  - { id: 2, class: gprnox0 }
+  - { id: 3, class: gprnox0 }
+  - { id: 4, class: gprnox0 }
+liveins:
+  - { reg: '$v8m8', virtual-reg: '%0' }
+  - { reg: '$x10', virtual-reg: '%1' }
+  - { reg: '$x11', virtual-reg: '%2' }
+  - { reg: '$x12', virtual-reg: '%3' }
+  - { reg: '$x13', virtual-reg: '%4' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $v8m8, $x10, $x11, $x12, $x13
+    ; CHECK-LABEL: name: consecutive_xsfmm
+    ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12
+    ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %0:vrm8 = COPY $v8m8
+    %1:gprnox0 = COPY $x10
+    %2:gprnox0 = COPY $x11
+    %3:gprnox0 = COPY $x12
+    %4:gprnox0 = COPY $x13
+    PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm
+    PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1
+    PseudoRET
+...
+---
+name:            vsettnt_max
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: vsettnt_max
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %3:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            single_vsettm
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: single_vsettm
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %1:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            single_vsettn
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: single_vsettn
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %1:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            single_vsettk
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10
+    ; CHECK-LABEL: name: single_vsettk
+    ; CHECK: liveins: $x10
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+    ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]]
+    ; CHECK-NEXT: PseudoRET implicit $x10
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+    $x10 = COPY %1:gprnox0
+    PseudoRET implicit $x10
+...
+---
+name:            sf_vtzero
+alignment:       4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnox0 }
+  - { id: 1, class: gprnox0 }
+liveins:
+  - { reg: '$x10', virtual-reg: '%0' }
+  - { reg: '$x11', virtual-reg: '%1' }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    liveins: $x10, $x11
+    ; CHECK-LABEL: name: sf_vtzero
+    ; CHECK: liveins: $x10, $x11
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype
+    ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    %0:gprnox0 = COPY $x10
+    %1:gprnox0 = COPY $x11
+    PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/Thumb2/carry.ll b/llvm/test/CodeGen/Thumb2/carry.ll
index 1e2b332..47c7918 100644
--- a/llvm/test/CodeGen/Thumb2/carry.ll
+++ b/llvm/test/CodeGen/Thumb2/carry.ll
@@ -1,35 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
 
 define i64 @f1(i64 %a, i64 %b) {
-entry:
 ; CHECK-LABEL: f1:
-; CHECK: subs r0, r0, r2
-; CHECK: sbcs r1, r3
-	%tmp = sub i64 %a, %b
-	ret i64 %tmp
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    subs r0, r0, r2
+; CHECK-NEXT:    sbcs r1, r3
+; CHECK-NEXT:    bx lr
+entry:
+  %tmp = sub i64 %a, %b
+  ret i64 %tmp
 }
 
 define i64 @f2(i64 %a, i64 %b) {
-entry:
 ; CHECK-LABEL: f2:
-; CHECK: lsls  r1, r1, #1
-; CHECK: orr.w r1, r1, r0, lsr #31
-; CHECK: rsbs  r0, r2, r0, lsl #1
-; CHECK: sbcs  r1, r3
-        %tmp1 = shl i64 %a, 1
-	%tmp2 = sub i64 %tmp1, %b
-	ret i64 %tmp2
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsls r1, r1, #1
+; CHECK-NEXT:    orr.w r1, r1, r0, lsr #31
+; CHECK-NEXT:    rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT:    sbcs r1, r3
+; CHECK-NEXT:    bx lr
+entry:
+  %tmp1 = shl i64 %a, 1
+  %tmp2 = sub i64 %tmp1, %b
+  ret i64 %tmp2
 }
 
 ; rdar://12559385
 define i64 @f3(i32 %vi) {
-entry:
 ; CHECK-LABEL: f3:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbcs r{{[0-9]+}}, [[REG]]
-    %v0 = zext i32 %vi to i64
-    %v1 = xor i64 %v0, -155057456198619
-    %v4 = add i64 %v1, 155057456198619
-    %v5 = add i64 %v4, %v1
-    ret i64 %v5
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    movw r1, #19493
+; CHECK-NEXT:    movt r1, #57191
+; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    movw r2, #29433
+; CHECK-NEXT:    movw r3, #46043
+; CHECK-NEXT:    movw r1, #36102
+; CHECK-NEXT:    movt r2, #65535
+; CHECK-NEXT:    adds r0, r0, r0
+; CHECK-NEXT:    movt r3, #8344
+; CHECK-NEXT:    sbcs r2, r1
+; CHECK-NEXT:    adds r0, r0, r3
+; CHECK-NEXT:    adcs r1, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %v0 = zext i32 %vi to i64
+  %v1 = xor i64 %v0, -155057456198619
+  %v4 = add i64 %v1, 155057456198619
+  %v5 = add i64 %v4, %v1
+  ret i64 %v5
 }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
new file mode 100644
index 0000000..3654aae
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_1:
+; CHECK:         .functype dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.dot_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %sext1 = sext <8 x i16> %a to <8 x i32>
+  %sext2 = sext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %sext1, %sext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
+
+
+define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_2:
+; CHECK:         .functype dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.dot_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %sext1 = sext <8 x i16> %a to <8 x i32>
+  %sext2 = sext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %sext1, %sext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle2, %shuffle1
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @dot_sext_self(<8 x i16> %v) {
+; CHECK-LABEL: dot_sext_self:
+; CHECK:         .functype dot_sext_self (v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.dot_i16x8_s
+; CHECK-NEXT:    # fallthrough-return
+  %sext = sext <8 x i16> %v to <8 x i32>
+  %mul = mul <8 x i32> %sext, %sext
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
+
+; Negative test: the operands are zero-extended, so the signed i32x4.dot_i16x8_s must not be selected.
+define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_zext:
+; CHECK:         .functype dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT:    .local v128
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_low_i16x8_u
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_high_i16x8_u
+; CHECK-NEXT:    local.tee 1
+; CHECK-NEXT:    i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT:    i32x4.add
+; CHECK-NEXT:    # fallthrough-return
+  %zext1 = zext <8 x i16> %a to <8 x i32>
+  %zext2 = zext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %zext1, %zext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
+
+; Negative test: the shuffles take the low/high halves rather than the even/odd lanes, so no dot is formed.
+define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_wrong_shuffle:
+; CHECK:         .functype dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_low_i16x8_s
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32x4.extmul_high_i16x8_s
+; CHECK-NEXT:    i32x4.add
+; CHECK-NEXT:    # fallthrough-return
+  %sext1 = sext <8 x i16> %a to <8 x i32>
+  %sext2 = sext <8 x i16> %b to <8 x i32>
+  %mul = mul <8 x i32> %sext1, %sext2
+  %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %res = add <4 x i32> %shuffle1, %shuffle2
+  ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de3..600241a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -2,9 +2,278 @@
 
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
 ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+fp16,+simd128,              | FileCheck %s --check-prefix=STRICT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers  -mattr=+simd128                     | FileCheck %s --check-prefix=NOFP16
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers                                      | FileCheck %s --check-prefix=NOSIMD
 
 target triple = "wasm32"
 
+define half @fadd_fmul_contract_f16(half %a, half %b, half %c) { ; scalar half, separate fmul + fadd both flagged contract; every run configuration expects an unfused f32.mul + f32.add on operands passed through __truncsfhf2/__extendhfsf2
+; RELAXED-LABEL: fadd_fmul_contract_f16:
+; RELAXED:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    call $push0=, __truncsfhf2, $0
+; RELAXED-NEXT:    call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT:    call $push2=, __truncsfhf2, $1
+; RELAXED-NEXT:    call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT:    f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT:    call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT:    call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT:    f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT:    return $pop7
+;
+; STRICT-LABEL: fadd_fmul_contract_f16:
+; STRICT:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    call $push0=, __truncsfhf2, $0
+; STRICT-NEXT:    call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT:    call $push2=, __truncsfhf2, $1
+; STRICT-NEXT:    call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT:    f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT:    call $push5=, __truncsfhf2, $2
+; STRICT-NEXT:    call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT:    f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT:    return $pop7
+;
+; NOFP16-LABEL: fadd_fmul_contract_f16:
+; NOFP16:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $0
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    return $pop7
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f16:
+; NOSIMD:         .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $0
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    return $pop7
+  %mul = fmul contract half %b, %a ; product of the first two arguments (written as %b * %a)
+  %add = fadd contract half %mul, %c ; contract on both ops permits, but does not require, fusing them
+  ret half %add
+}
+
+define half @fmuladd_contract_f16(half %a, half %b, half %c) { ; scalar half llvm.fmuladd call carrying the contract flag; every run configuration expects f32.mul + f32.add on operands passed through __truncsfhf2/__extendhfsf2
+; RELAXED-LABEL: fmuladd_contract_f16:
+; RELAXED:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT:    call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT:    call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT:    call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT:    f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT:    call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT:    call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT:    f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT:    return $pop7
+;
+; STRICT-LABEL: fmuladd_contract_f16:
+; STRICT:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    call $push0=, __truncsfhf2, $1
+; STRICT-NEXT:    call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT:    call $push2=, __truncsfhf2, $0
+; STRICT-NEXT:    call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT:    f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT:    call $push5=, __truncsfhf2, $2
+; STRICT-NEXT:    call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT:    f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT:    return $pop7
+;
+; NOFP16-LABEL: fmuladd_contract_f16:
+; NOFP16:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    return $pop7
+;
+; NOSIMD-LABEL: fmuladd_contract_f16:
+; NOSIMD:         .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    return $pop7
+  %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c) ; a * b + c; the intrinsic may be lowered fused or as separate mul/add
+  ret half %fma
+}
+
+define half @fmuladd_f16(half %a, half %b, half %c) { ; scalar half llvm.fmuladd call with no fast-math flags; every run configuration expects f32.mul + f32.add on operands passed through __truncsfhf2/__extendhfsf2
+; RELAXED-LABEL: fmuladd_f16:
+; RELAXED:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT:    call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT:    call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT:    call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT:    f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT:    call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT:    call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT:    f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT:    return $pop7
+;
+; STRICT-LABEL: fmuladd_f16:
+; STRICT:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    call $push0=, __truncsfhf2, $1
+; STRICT-NEXT:    call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT:    call $push2=, __truncsfhf2, $0
+; STRICT-NEXT:    call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT:    f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT:    call $push5=, __truncsfhf2, $2
+; STRICT-NEXT:    call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT:    f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT:    return $pop7
+;
+; NOFP16-LABEL: fmuladd_f16:
+; NOFP16:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    return $pop7
+;
+; NOSIMD-LABEL: fmuladd_f16:
+; NOSIMD:         .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    return $pop7
+  %fma = call half @llvm.fmuladd(half %a, half %b, half %c) ; a * b + c; same expected output as the contract-flagged variant of this test
+  ret half %fma
+}
+
+
+define float @fadd_fmul_contract_f32(float %a, float %b, float %c) { ; scalar float, separate fmul + fadd both flagged contract; every run configuration expects a plain f32.mul followed by f32.add
+; RELAXED-LABEL: fadd_fmul_contract_f32:
+; RELAXED:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $1, $0
+; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fadd_fmul_contract_f32:
+; STRICT:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $1, $0
+; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f32:
+; NOFP16:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32.mul $push0=, $1, $0
+; NOFP16-NEXT:    f32.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f32:
+; NOSIMD:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $1, $0
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %mul = fmul contract float %b, %a ; operand order %b, %a matches the expected "f32.mul $1, $0"
+  %add = fadd contract float %mul, %c ; contract on both ops permits, but does not require, fusing them
+  ret float %add
+}
+
+define float @fmuladd_contract_f32(float %a, float %b, float %c) { ; scalar float llvm.fmuladd call carrying the contract flag; every run configuration expects a plain f32.mul followed by f32.add
+; RELAXED-LABEL: fmuladd_contract_f32:
+; RELAXED:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $0, $1
+; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f32:
+; STRICT:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $0, $1
+; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f32:
+; NOFP16:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f32:
+; NOSIMD:         .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c) ; a * b + c; the intrinsic may be lowered fused or as separate mul/add
+  ret float %fma
+}
+
+define float @fmuladd_f32(float %a, float %b, float %c) { ; scalar float llvm.fmuladd call with no fast-math flags; every run configuration expects a plain f32.mul followed by f32.add
+; RELAXED-LABEL: fmuladd_f32:
+; RELAXED:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32.mul $push0=, $0, $1
+; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_f32:
+; STRICT:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32.mul $push0=, $0, $1
+; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_f32:
+; NOFP16:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f32:
+; NOSIMD:         .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call float @llvm.fmuladd(float %a, float %b, float %c) ; a * b + c; same expected output as the contract-flagged variant of this test
+  ret float %fma
+}
+
 define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_f64:
 ; RELAXED:         .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
@@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
 ; STRICT-NEXT:    f64.mul $push0=, $1, $0
 ; STRICT-NEXT:    f64.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f64:
+; NOFP16:         .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64.mul $push0=, $1, $0
+; NOFP16-NEXT:    f64.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f64:
+; NOSIMD:         .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $1, $0
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
   %mul = fmul contract double %b, %a
   %add = fadd contract double %mul, %c
   ret double %add
 }
 
+define double @fmuladd_f64(double %a, double %b, double %c) { ; scalar double llvm.fmuladd call with no fast-math flags; every run configuration expects a plain f64.mul followed by f64.add
+; RELAXED-LABEL: fmuladd_f64:
+; RELAXED:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64.mul $push0=, $0, $1
+; RELAXED-NEXT:    f64.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_f64:
+; STRICT:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64.mul $push0=, $0, $1
+; STRICT-NEXT:    f64.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_f64:
+; NOFP16:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f64:
+; NOSIMD:         .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call double @llvm.fmuladd(double %a, double %b, double %c) ; a * b + c; the intrinsic may be lowered fused or as separate mul/add
+  ret double %fma
+}
+
+define double @fmuladd_contract_f64(double %a, double %b, double %c) { ; scalar double llvm.fmuladd call carrying the contract flag; every run configuration expects a plain f64.mul followed by f64.add
+; RELAXED-LABEL: fmuladd_contract_f64:
+; RELAXED:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64.mul $push0=, $0, $1
+; RELAXED-NEXT:    f64.add $push1=, $pop0, $2
+; RELAXED-NEXT:    return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f64:
+; STRICT:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64.mul $push0=, $0, $1
+; STRICT-NEXT:    f64.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f64:
+; NOFP16:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f64:
+; NOSIMD:         .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $0, $1
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT:    return $pop1
+  %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c) ; a * b + c; same expected output as the unflagged variant of this test
+  ret double %fma
+}
+
 define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_4xf32:
 ; RELAXED:         .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fadd_fmul_contract_4xf32:
@@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
 ; STRICT-NEXT:    f32x4.mul $push0=, $1, $0
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_4xf32:
+; NOFP16:         .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_4xf32:
+; NOSIMD:         .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $8, $4
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $7, $3
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $6, $2
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $5, $1
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %mul = fmul contract <4 x float> %b, %a
   %add = fadd contract <4 x float> %mul, %c
   ret <4 x float> %add
 }
 
-
 define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_8xf16:
 ; RELAXED:         .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f16x8.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f16x8.madd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fadd_fmul_contract_8xf16:
 ; STRICT:         .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; STRICT-NEXT:  # %bb.0:
-; STRICT-NEXT:    f16x8.mul $push0=, $1, $0
-; STRICT-NEXT:    f16x8.add $push1=, $pop0, $2
-; STRICT-NEXT:    return $pop1
+; STRICT-NEXT:    f16x8.madd $push0=, $1, $0, $2
+; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf16:
+; NOFP16:         .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $8
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $16
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT:    i32.store16 14($0), $pop8
+; NOFP16-NEXT:    call $push9=, __truncsfhf2, $7
+; NOFP16-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT:    call $push11=, __truncsfhf2, $15
+; NOFP16-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT:    call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT:    i32.store16 12($0), $pop17
+; NOFP16-NEXT:    call $push18=, __truncsfhf2, $6
+; NOFP16-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT:    call $push20=, __truncsfhf2, $14
+; NOFP16-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT:    call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT:    i32.store16 10($0), $pop26
+; NOFP16-NEXT:    call $push27=, __truncsfhf2, $5
+; NOFP16-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT:    call $push29=, __truncsfhf2, $13
+; NOFP16-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT:    call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT:    i32.store16 8($0), $pop35
+; NOFP16-NEXT:    call $push36=, __truncsfhf2, $4
+; NOFP16-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT:    call $push38=, __truncsfhf2, $12
+; NOFP16-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT:    call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT:    i32.store16 6($0), $pop44
+; NOFP16-NEXT:    call $push45=, __truncsfhf2, $3
+; NOFP16-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT:    call $push47=, __truncsfhf2, $11
+; NOFP16-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT:    call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT:    i32.store16 4($0), $pop53
+; NOFP16-NEXT:    call $push54=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT:    call $push56=, __truncsfhf2, $10
+; NOFP16-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT:    call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT:    i32.store16 2($0), $pop62
+; NOFP16-NEXT:    call $push63=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT:    call $push65=, __truncsfhf2, $9
+; NOFP16-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT:    call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT:    i32.store16 0($0), $pop71
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf16:
+; NOSIMD:         .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $8
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $16
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT:    i32.store16 14($0), $pop8
+; NOSIMD-NEXT:    call $push9=, __truncsfhf2, $7
+; NOSIMD-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT:    call $push11=, __truncsfhf2, $15
+; NOSIMD-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT:    call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT:    i32.store16 12($0), $pop17
+; NOSIMD-NEXT:    call $push18=, __truncsfhf2, $6
+; NOSIMD-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT:    call $push20=, __truncsfhf2, $14
+; NOSIMD-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT:    call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT:    i32.store16 10($0), $pop26
+; NOSIMD-NEXT:    call $push27=, __truncsfhf2, $5
+; NOSIMD-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT:    call $push29=, __truncsfhf2, $13
+; NOSIMD-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT:    call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT:    i32.store16 8($0), $pop35
+; NOSIMD-NEXT:    call $push36=, __truncsfhf2, $4
+; NOSIMD-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT:    call $push38=, __truncsfhf2, $12
+; NOSIMD-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT:    call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT:    i32.store16 6($0), $pop44
+; NOSIMD-NEXT:    call $push45=, __truncsfhf2, $3
+; NOSIMD-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT:    call $push47=, __truncsfhf2, $11
+; NOSIMD-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT:    call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT:    i32.store16 4($0), $pop53
+; NOSIMD-NEXT:    call $push54=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT:    call $push56=, __truncsfhf2, $10
+; NOSIMD-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT:    call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT:    i32.store16 2($0), $pop62
+; NOSIMD-NEXT:    call $push63=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT:    call $push65=, __truncsfhf2, $9
+; NOSIMD-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT:    call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT:    i32.store16 0($0), $pop71
+; NOSIMD-NEXT:    return
   %mul = fmul contract <8 x half> %b, %a
   %add = fadd contract <8 x half> %mul, %c
   ret <8 x half> %add
 }
 
-
 define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fadd_fmul_4xf32:
 ; RELAXED:         .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
@@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
 ; STRICT-NEXT:    f32x4.mul $push0=, $1, $0
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_4xf32:
+; NOFP16:         .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_4xf32:
+; NOSIMD:         .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $8, $4
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $7, $3
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $6, $2
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $5, $1
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %mul = fmul <4 x float> %b, %a
   %add = fadd contract <4 x float> %mul, %c
   ret <4 x float> %add
 }
 
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED:         .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT:         .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fmuladd_contract_8xf16:
+; NOFP16:         .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT:    i32.store16 14($0), $pop8
+; NOFP16-NEXT:    call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT:    call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT:    call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT:    i32.store16 12($0), $pop17
+; NOFP16-NEXT:    call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT:    call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT:    call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT:    i32.store16 10($0), $pop26
+; NOFP16-NEXT:    call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT:    call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT:    call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT:    i32.store16 8($0), $pop35
+; NOFP16-NEXT:    call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT:    call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT:    call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT:    i32.store16 6($0), $pop44
+; NOFP16-NEXT:    call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT:    call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT:    call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT:    i32.store16 4($0), $pop53
+; NOFP16-NEXT:    call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT:    call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT:    call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT:    i32.store16 2($0), $pop62
+; NOFP16-NEXT:    call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT:    call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT:    call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT:    i32.store16 0($0), $pop71
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fmuladd_contract_8xf16:
+; NOSIMD:         .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT:    i32.store16 14($0), $pop8
+; NOSIMD-NEXT:    call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT:    call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT:    call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT:    i32.store16 12($0), $pop17
+; NOSIMD-NEXT:    call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT:    call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT:    call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT:    i32.store16 10($0), $pop26
+; NOSIMD-NEXT:    call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT:    call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT:    call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT:    i32.store16 8($0), $pop35
+; NOSIMD-NEXT:    call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT:    call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT:    call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT:    i32.store16 6($0), $pop44
+; NOSIMD-NEXT:    call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT:    call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT:    call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT:    i32.store16 4($0), $pop53
+; NOSIMD-NEXT:    call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT:    call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT:    call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT:    i32.store16 2($0), $pop62
+; NOSIMD-NEXT:    call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT:    call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT:    call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT:    i32.store16 0($0), $pop71
+; NOSIMD-NEXT:    return
+  %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+  ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fmuladd_8xf16:
+; NOFP16:         .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT:    call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT:    call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT:    i32.store16 14($0), $pop8
+; NOFP16-NEXT:    call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT:    call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT:    call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT:    i32.store16 12($0), $pop17
+; NOFP16-NEXT:    call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT:    call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT:    call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT:    i32.store16 10($0), $pop26
+; NOFP16-NEXT:    call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT:    call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT:    call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT:    i32.store16 8($0), $pop35
+; NOFP16-NEXT:    call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT:    call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT:    call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT:    i32.store16 6($0), $pop44
+; NOFP16-NEXT:    call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT:    call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT:    call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT:    i32.store16 4($0), $pop53
+; NOFP16-NEXT:    call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT:    call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT:    call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT:    i32.store16 2($0), $pop62
+; NOFP16-NEXT:    call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT:    call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT:    call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT:    i32.store16 0($0), $pop71
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fmuladd_8xf16:
+; NOSIMD:         .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT:    call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT:    call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT:    call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT:    f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT:    call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT:    call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT:    f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT:    call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT:    i32.store16 14($0), $pop8
+; NOSIMD-NEXT:    call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT:    call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT:    call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT:    call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT:    f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT:    call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT:    call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT:    f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT:    call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT:    i32.store16 12($0), $pop17
+; NOSIMD-NEXT:    call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT:    call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT:    call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT:    call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT:    f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT:    call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT:    call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT:    f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT:    call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT:    i32.store16 10($0), $pop26
+; NOSIMD-NEXT:    call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT:    call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT:    call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT:    call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT:    f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT:    call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT:    call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT:    f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT:    call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT:    i32.store16 8($0), $pop35
+; NOSIMD-NEXT:    call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT:    call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT:    call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT:    call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT:    f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT:    call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT:    call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT:    f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT:    call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT:    i32.store16 6($0), $pop44
+; NOSIMD-NEXT:    call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT:    call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT:    call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT:    call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT:    f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT:    call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT:    call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT:    f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT:    call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT:    i32.store16 4($0), $pop53
+; NOSIMD-NEXT:    call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT:    call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT:    call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT:    call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT:    f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT:    call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT:    call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT:    f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT:    call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT:    i32.store16 2($0), $pop62
+; NOSIMD-NEXT:    call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT:    call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT:    call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT:    call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT:    f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT:    call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT:    call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT:    f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT:    call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT:    i32.store16 0($0), $pop71
+; NOSIMD-NEXT:    return
+  %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+  ret <8 x half> %fma
+}
+
 define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fmuladd_contract_4xf32:
 ; RELAXED:         .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $0, $1, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fmuladd_contract_4xf32:
@@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x
 ; STRICT-NEXT:    f32x4.mul $push0=, $0, $1
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_4xf32:
+; NOFP16:         .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_4xf32:
+; NOSIMD:         .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $4, $8
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $3, $7
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $2, $6
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $1, $5
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %fma
 }
 
-; TODO: This should also have relaxed_madd in RELAXED case
 define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fmuladd_4xf32:
 ; RELAXED:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT:    f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT:    return $pop1
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fmuladd_4xf32:
 ; STRICT:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
 ; STRICT-NEXT:    f32x4.mul $push0=, $0, $1
 ; STRICT-NEXT:    f32x4.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_4xf32:
+; NOFP16:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_4xf32:
+; NOSIMD:         .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $4, $8
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $3, $7
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $2, $6
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $1, $5
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop7
+; NOSIMD-NEXT:    return
   %fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %fma
 }
 
+define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fmuladd_8xf32:
+; RELAXED:         .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.mul $push0=, $2, $4
+; RELAXED-NEXT:    f32x4.add $push1=, $pop0, $6
+; RELAXED-NEXT:    v128.store 16($0), $pop1
+; RELAXED-NEXT:    f32x4.mul $push2=, $1, $3
+; RELAXED-NEXT:    f32x4.add $push3=, $pop2, $5
+; RELAXED-NEXT:    v128.store 0($0), $pop3
+; RELAXED-NEXT:    return
+;
+; STRICT-LABEL: fmuladd_8xf32:
+; STRICT:         .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $2, $4
+; STRICT-NEXT:    f32x4.add $push1=, $pop0, $6
+; STRICT-NEXT:    v128.store 16($0), $pop1
+; STRICT-NEXT:    f32x4.mul $push2=, $1, $3
+; STRICT-NEXT:    f32x4.add $push3=, $pop2, $5
+; STRICT-NEXT:    v128.store 0($0), $pop3
+; STRICT-NEXT:    return
+;
+; NOFP16-LABEL: fmuladd_8xf32:
+; NOFP16:         .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $2, $4
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT:    v128.store 16($0), $pop1
+; NOFP16-NEXT:    f32x4.mul $push2=, $1, $3
+; NOFP16-NEXT:    f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT:    v128.store 0($0), $pop3
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fmuladd_8xf32:
+; NOSIMD:         .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $8, $16
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT:    f32.store 28($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $7, $15
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT:    f32.store 24($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $6, $14
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT:    f32.store 20($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $5, $13
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT:    f32.store 16($0), $pop7
+; NOSIMD-NEXT:    f32.mul $push8=, $4, $12
+; NOSIMD-NEXT:    f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT:    f32.store 12($0), $pop9
+; NOSIMD-NEXT:    f32.mul $push10=, $3, $11
+; NOSIMD-NEXT:    f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT:    f32.store 8($0), $pop11
+; NOSIMD-NEXT:    f32.mul $push12=, $2, $10
+; NOSIMD-NEXT:    f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT:    f32.store 4($0), $pop13
+; NOSIMD-NEXT:    f32.mul $push14=, $1, $9
+; NOSIMD-NEXT:    f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT:    f32.store 0($0), $pop15
+; NOSIMD-NEXT:    return
+  %fma = call <8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  ret <8 x float> %fma
+}
+
+define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_contract_2xf64:
+; RELAXED:         .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_2xf64:
+; STRICT:         .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $0, $1
+; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_2xf64:
+; NOFP16:         .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_2xf64:
+; NOSIMD:         .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $2, $4
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $1, $3
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
+  %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $0, $1
+; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fmuladd_2xf64:
+; NOFP16:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fmuladd_2xf64:
+; NOSIMD:         .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $2, $4
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $1, $3
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
+  %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
+
 define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; RELAXED-LABEL: fma_4xf32:
 ; RELAXED:         .functype fma_4xf32 (v128, v128, v128) -> (v128)
@@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; STRICT-NEXT:    call $push18=, fmaf, $pop17, $pop16, $pop15
 ; STRICT-NEXT:    f32x4.replace_lane $push19=, $pop14, 3, $pop18
 ; STRICT-NEXT:    return $pop19
+;
+; NOFP16-LABEL: fma_4xf32:
+; NOFP16:         .functype fma_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.extract_lane $push2=, $0, 0
+; NOFP16-NEXT:    f32x4.extract_lane $push1=, $1, 0
+; NOFP16-NEXT:    f32x4.extract_lane $push0=, $2, 0
+; NOFP16-NEXT:    call $push3=, fmaf, $pop2, $pop1, $pop0
+; NOFP16-NEXT:    f32x4.splat $push4=, $pop3
+; NOFP16-NEXT:    f32x4.extract_lane $push7=, $0, 1
+; NOFP16-NEXT:    f32x4.extract_lane $push6=, $1, 1
+; NOFP16-NEXT:    f32x4.extract_lane $push5=, $2, 1
+; NOFP16-NEXT:    call $push8=, fmaf, $pop7, $pop6, $pop5
+; NOFP16-NEXT:    f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; NOFP16-NEXT:    f32x4.extract_lane $push12=, $0, 2
+; NOFP16-NEXT:    f32x4.extract_lane $push11=, $1, 2
+; NOFP16-NEXT:    f32x4.extract_lane $push10=, $2, 2
+; NOFP16-NEXT:    call $push13=, fmaf, $pop12, $pop11, $pop10
+; NOFP16-NEXT:    f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; NOFP16-NEXT:    f32x4.extract_lane $push17=, $0, 3
+; NOFP16-NEXT:    f32x4.extract_lane $push16=, $1, 3
+; NOFP16-NEXT:    f32x4.extract_lane $push15=, $2, 3
+; NOFP16-NEXT:    call $push18=, fmaf, $pop17, $pop16, $pop15
+; NOFP16-NEXT:    f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; NOFP16-NEXT:    return $pop19
+;
+; NOSIMD-LABEL: fma_4xf32:
+; NOSIMD:         .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, fmaf, $4, $8, $12
+; NOSIMD-NEXT:    f32.store 12($0), $pop0
+; NOSIMD-NEXT:    call $push1=, fmaf, $3, $7, $11
+; NOSIMD-NEXT:    f32.store 8($0), $pop1
+; NOSIMD-NEXT:    call $push2=, fmaf, $2, $6, $10
+; NOSIMD-NEXT:    f32.store 4($0), $pop2
+; NOSIMD-NEXT:    call $push3=, fmaf, $1, $5, $9
+; NOSIMD-NEXT:    f32.store 0($0), $pop3
+; NOSIMD-NEXT:    return
   %fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %fma
 }
@@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
 ; RELAXED-LABEL: fadd_fmul_contract_8xf32:
 ; RELAXED:         .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $6, $4, $2
+; RELAXED-NEXT:    f32x4.relaxed_madd $push0=, $4, $2, $6
 ; RELAXED-NEXT:    v128.store 16($0), $pop0
-; RELAXED-NEXT:    f32x4.relaxed_madd $push1=, $5, $3, $1
+; RELAXED-NEXT:    f32x4.relaxed_madd $push1=, $3, $1, $5
 ; RELAXED-NEXT:    v128.store 0($0), $pop1
 ; RELAXED-NEXT:    return
 ;
@@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
 ; STRICT-NEXT:    f32x4.add $push3=, $pop2, $5
 ; STRICT-NEXT:    v128.store 0($0), $pop3
 ; STRICT-NEXT:    return
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf32:
+; NOFP16:         .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f32x4.mul $push0=, $4, $2
+; NOFP16-NEXT:    f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT:    v128.store 16($0), $pop1
+; NOFP16-NEXT:    f32x4.mul $push2=, $3, $1
+; NOFP16-NEXT:    f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT:    v128.store 0($0), $pop3
+; NOFP16-NEXT:    return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf32:
+; NOSIMD:         .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f32.mul $push0=, $16, $8
+; NOSIMD-NEXT:    f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT:    f32.store 28($0), $pop1
+; NOSIMD-NEXT:    f32.mul $push2=, $15, $7
+; NOSIMD-NEXT:    f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT:    f32.store 24($0), $pop3
+; NOSIMD-NEXT:    f32.mul $push4=, $14, $6
+; NOSIMD-NEXT:    f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT:    f32.store 20($0), $pop5
+; NOSIMD-NEXT:    f32.mul $push6=, $13, $5
+; NOSIMD-NEXT:    f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT:    f32.store 16($0), $pop7
+; NOSIMD-NEXT:    f32.mul $push8=, $12, $4
+; NOSIMD-NEXT:    f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT:    f32.store 12($0), $pop9
+; NOSIMD-NEXT:    f32.mul $push10=, $11, $3
+; NOSIMD-NEXT:    f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT:    f32.store 8($0), $pop11
+; NOSIMD-NEXT:    f32.mul $push12=, $10, $2
+; NOSIMD-NEXT:    f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT:    f32.store 4($0), $pop13
+; NOSIMD-NEXT:    f32.mul $push14=, $9, $1
+; NOSIMD-NEXT:    f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT:    f32.store 0($0), $pop15
+; NOSIMD-NEXT:    return
   %mul = fmul contract <8 x float> %b, %a
   %add = fadd contract <8 x float> %mul, %c
   ret <8 x float> %add
 }
 
-
 define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; RELAXED-LABEL: fadd_fmul_contract_2xf64:
 ; RELAXED:         .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f64x2.relaxed_madd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fadd_fmul_contract_2xf64:
@@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
 ; STRICT-NEXT:    f64x2.mul $push0=, $1, $0
 ; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_2xf64:
+; NOFP16:         .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_2xf64:
+; NOSIMD:         .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $4, $2
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $3, $1
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
   %mul = fmul contract <2 x double> %b, %a
   %add = fadd contract <2 x double> %mul, %c
   ret <2 x double> %add
 }
 
-define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fadd_fmul_contract_f32:
-; RELAXED:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_2xf64:
+; RELAXED:         .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32.mul $push0=, $1, $0
-; RELAXED-NEXT:    f32.add $push1=, $pop0, $2
+; RELAXED-NEXT:    f64x2.mul $push0=, $1, $0
+; RELAXED-NEXT:    f64x2.add $push1=, $pop0, $2
 ; RELAXED-NEXT:    return $pop1
 ;
-; STRICT-LABEL: fadd_fmul_contract_f32:
-; STRICT:         .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-LABEL: fadd_fmul_2xf64:
+; STRICT:         .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
 ; STRICT-NEXT:  # %bb.0:
-; STRICT-NEXT:    f32.mul $push0=, $1, $0
-; STRICT-NEXT:    f32.add $push1=, $pop0, $2
+; STRICT-NEXT:    f64x2.mul $push0=, $1, $0
+; STRICT-NEXT:    f64x2.add $push1=, $pop0, $2
 ; STRICT-NEXT:    return $pop1
-  %mul = fmul contract float %b, %a
-  %add = fadd contract float %mul, %c
-  ret float %add
+;
+; NOFP16-LABEL: fadd_fmul_2xf64:
+; NOFP16:         .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT:    f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT:    return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_2xf64:
+; NOSIMD:         .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    f64.mul $push0=, $4, $2
+; NOSIMD-NEXT:    f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT:    f64.store 8($0), $pop1
+; NOSIMD-NEXT:    f64.mul $push2=, $3, $1
+; NOSIMD-NEXT:    f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT:    f64.store 0($0), $pop3
+; NOSIMD-NEXT:    return
+  %mul = fmul <2 x double> %b, %a
+  %add = fadd <2 x double> %mul, %c
+  ret <2 x double> %add
 }
 
 define float @fma_f32(float %a, float %b, float %c) {
@@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) {
 ; STRICT-NEXT:  # %bb.0:
 ; STRICT-NEXT:    call $push0=, fmaf, $0, $1, $2
 ; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fma_f32:
+; NOFP16:         .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, fmaf, $0, $1, $2
+; NOFP16-NEXT:    return $pop0
+;
+; NOSIMD-LABEL: fma_f32:
+; NOSIMD:         .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, fmaf, $0, $1, $2
+; NOSIMD-NEXT:    return $pop0
   %fma = call float @llvm.fma(float %a, float %b, float %c)
   ret float %fma
 }
@@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) {
 ; STRICT-NEXT:  # %bb.0:
 ; STRICT-NEXT:    call $push0=, fma, $0, $1, $2
 ; STRICT-NEXT:    return $pop0
+;
+; NOFP16-LABEL: fma_f64:
+; NOFP16:         .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT:  # %bb.0:
+; NOFP16-NEXT:    call $push0=, fma, $0, $1, $2
+; NOFP16-NEXT:    return $pop0
+;
+; NOSIMD-LABEL: fma_f64:
+; NOSIMD:         .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT:  # %bb.0:
+; NOSIMD-NEXT:    call $push0=, fma, $0, $1, $2
+; NOSIMD-NEXT:    return $pop0
   %fma = call double @llvm.fma(double %a, double %b, double %c)
   ret double %fma
 }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
index 6e2d860..b90c1da 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
 ; RELAXED-LABEL: fsub_fmul_contract_4xf32:
 ; RELAXED:         .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fsub_fmul_contract_4xf32:
@@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
 ; RELAXED-LABEL: fsub_fmul_contract_8xf16:
 ; RELAXED:         .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f16x8.nmadd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fsub_fmul_contract_8xf16:
 ; STRICT:         .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
 ; STRICT-NEXT:  # %bb.0:
-; STRICT-NEXT:    f16x8.mul $push0=, $1, $0
-; STRICT-NEXT:    f16x8.sub $push1=, $2, $pop0
-; STRICT-NEXT:    return $pop1
+; STRICT-NEXT:    f16x8.nmadd $push0=, $1, $0, $2
+; STRICT-NEXT:    return $pop0
   %mul = fmul contract <8 x half> %b, %a
   %sub = fsub contract <8 x half> %c, %mul
   ret <8 x half> %sub
@@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
 ; RELAXED-LABEL: fsub_fmul_contract_8xf32:
 ; RELAXED:         .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $4, $2, $6
 ; RELAXED-NEXT:    v128.store 16($0), $pop0
-; RELAXED-NEXT:    f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push1=, $3, $1, $5
 ; RELAXED-NEXT:    v128.store 0($0), $pop1
 ; RELAXED-NEXT:    return
 ;
@@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
 ; RELAXED-LABEL: fsub_fmul_contract_2xf64:
 ; RELAXED:         .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
 ; RELAXED-NEXT:  # %bb.0:
-; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $1, $0, $2
 ; RELAXED-NEXT:    return $pop0
 ;
 ; STRICT-LABEL: fsub_fmul_contract_2xf64:
@@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
   ret float %sub
 }
 
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f16x8.nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT:         .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f16x8.nmadd $push0=, $0, $1, $2
+; STRICT-NEXT:    return $pop0
+  %fneg = fneg <8 x half> %a
+  %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c)
+  ret <8 x half> %fma
+}
+
+define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_4xf32:
+; RELAXED:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f32x4.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_4xf32:
+; STRICT:         .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f32x4.mul $push0=, $0, $1
+; STRICT-NEXT:    f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %fneg = fneg <4 x float> %a
+  %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT:  # %bb.0:
+; RELAXED-NEXT:    f64x2.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT:    return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT:         .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT:  # %bb.0:
+; STRICT-NEXT:    f64x2.mul $push0=, $0, $1
+; STRICT-NEXT:    f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT:    return $pop1
+  %fneg = fneg <2 x double> %a
+  %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %fma
+}
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 0de308a..5152c005 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -728,45 +728,70 @@ define void @avg_v32i8_2(ptr %a, ptr %b) nounwind {
 define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
 ; SSE2-LABEL: avg_v64i8_2:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps (%rsi), %xmm0
-; SSE2-NEXT:    movaps 16(%rsi), %xmm1
-; SSE2-NEXT:    movaps 32(%rsi), %xmm2
-; SSE2-NEXT:    movaps 48(%rsi), %xmm3
-; SSE2-NEXT:    movups %xmm3, (%rax)
-; SSE2-NEXT:    movups %xmm2, (%rax)
-; SSE2-NEXT:    movups %xmm1, (%rax)
-; SSE2-NEXT:    movups %xmm0, (%rax)
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
+; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
+; SSE2-NEXT:    pavgb (%rsi), %xmm0
+; SSE2-NEXT:    pavgb 16(%rsi), %xmm1
+; SSE2-NEXT:    pavgb 32(%rsi), %xmm2
+; SSE2-NEXT:    pavgb 48(%rsi), %xmm3
+; SSE2-NEXT:    movdqu %xmm3, (%rax)
+; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    movdqu %xmm1, (%rax)
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: avg_v64i8_2:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rsi), %ymm0
-; AVX1-NEXT:    vmovaps 32(%rsi), %ymm1
-; AVX1-NEXT:    vmovups %ymm1, (%rax)
-; AVX1-NEXT:    vmovups %ymm0, (%rax)
-; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT:    vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT:    vpavgb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT:    vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT:    vpavgb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT:    vpavgb 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqu %xmm3, (%rax)
+; AVX1-NEXT:    vmovdqu %xmm2, (%rax)
+; AVX1-NEXT:    vmovdqu %xmm1, (%rax)
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: avg_v64i8_2:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps (%rsi), %ymm0
-; AVX2-NEXT:    vmovaps 32(%rsi), %ymm1
-; AVX2-NEXT:    vmovups %ymm1, (%rax)
-; AVX2-NEXT:    vmovups %ymm0, (%rax)
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqu %ymm1, (%rax)
+; AVX2-NEXT:    vmovdqu %ymm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: avg_v64i8_2:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps (%rsi), %zmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rax)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: avg_v64i8_2:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT:    vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: avg_v64i8_2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT:    vpavgb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, ptr %a
   %2 = load <64 x i8>, ptr %b
   %3 = zext <64 x i8> %1 to <64 x i32>
   %4 = zext <64 x i8> %2 to <64 x i32>
-  %5 = add nuw nsw <64 x i32> %4, %4
+  %5 = add nuw nsw <64 x i32> %3, %4
   %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %8 = trunc <64 x i32> %7 to <64 x i8>
@@ -774,7 +799,6 @@ define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
   ret void
 }
 
-
 define void @avg_v4i16_2(ptr %a, ptr %b) nounwind {
 ; SSE2-LABEL: avg_v4i16_2:
 ; SSE2:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
index a0c243b..f3950b7 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
@@ -1,16 +1,15 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-;; A minimal test case. llc will crash if global variables already has a section
-;; prefix. Subsequent PRs will expand on this test case to test the hotness
-;; reconciliation implementation.
-
-; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
+;; A minimal test case. Subsequent PRs will expand on this test case
+;; (e.g., with more functions, variables and profiles) and test the hotness
+;; reconciliation implementation.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
 ; RUN:     -partition-static-data-sections=true \
 ; RUN:     -data-sections=true  -unique-section-names=false \
-; RUN:     %s -o - 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN:     %s -o - 2>&1 | FileCheck %s --check-prefix=IR
 
-; ERR: Global variable hot_bss already has a section prefix hot
+; IR: .section .bss.hot.,"aw"
 
 @hot_bss = internal global i32 0, !section_prefix !17
 
diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll
index ce06d17..604b4fd 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition.ll
@@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu"
 ; UNIQ-NEXT:   .section	.data.unlikely.,"aw",@progbits,unique,8
 ; AGG-NEXT:    .section	.data.unlikely.,"aw",@progbits
 
+;; The `.section` directive is omitted for .data with -unique-section-names=false.
+;; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
 ; For @data_with_unknown_hotness
 ; SYM: 	       .type	.Ldata_with_unknown_hotness,@object          # @data_with_unknown_hotness
 ; SYM:         .section .data..Ldata_with_unknown_hotness,"aw",@progbits
 ; UNIQ:        .section  .data,"aw",@progbits,unique,9
-; The `.section` directive is omitted for .data with -unique-section-names=false.
-; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
 ; AGG:         .data
 ; COMMON:      .Ldata_with_unknown_hotness:
 
-; For @hot_data_custom_bar_section
-; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix.
+; For variables that are not eligible for section prefix annotation
 ; COMMON:      .type hot_data_custom_bar_section,@object
 ; SYM-NEXT:    .section bar,"aw",@progbits
 ; SYM:         hot_data_custom_bar_section
 ; UNIQ:        .section bar,"aw",@progbits
 ; AGG:         .section bar,"aw",@progbits
 
+; SYM:      .section .data.llvm.fake_var,"aw"
+; UNIQ:     .section .data,"aw"
+; AGG:      .data
+
+;; No section is emitted for an external declaration.
+; COMMON-NOT:  qux
+
 @.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1
 @.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1
 @hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3]
@@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu"
 @data3 = internal global i32 3
 @data_with_unknown_hotness = private global i32 5
 @hot_data_custom_bar_section = internal global i32 101 #0
+@llvm.fake_var = internal global i32 123
+@qux = external global i64
 
 define void @cold_func(i32 %0) !prof !15 {
   %2 = load i32, ptr @cold_bss
diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll
index ea22b08..954ea8f 100644
--- a/llvm/test/CodeGen/X86/relptr-rodata.ll
+++ b/llvm/test/CodeGen/X86/relptr-rodata.ll
@@ -10,16 +10,31 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: .long hidden-rodata
 @rodata = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @rodata to i64)) to i32)
 
+; CHECK: .section .rodata.rodata_ptrtoaddr
+; CHECK: rodata_ptrtoaddr:
+; CHECK: .long hidden-rodata_ptrtoaddr
+@rodata_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @rodata_ptrtoaddr to i64)) to i32)
+
 ; CHECK: .section .data.rel.ro.relro1
 ; CHECK: relro1:
 ; CHECK: .long default-relro1
 @relro1 = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @default to i64), i64 ptrtoint (ptr @relro1 to i64)) to i32)
 
+; CHECK: .section .data.rel.ro.relro1_ptrtoaddr
+; CHECK: relro1_ptrtoaddr:
+; CHECK: .long default-relro1_ptrtoaddr
+@relro1_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @default to i64), i64 ptrtoaddr (ptr @relro1_ptrtoaddr to i64)) to i32)
+
 ; CHECK: .section .data.rel.ro.relro2
 ; CHECK: relro2:
 ; CHECK: .long hidden-relro2
 @relro2 = constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @relro2 to i64)) to i32)
 
+; CHECK: .section .data.rel.ro.relro2_ptrtoaddr
+; CHECK: relro2_ptrtoaddr:
+; CHECK: .long hidden-relro2_ptrtoaddr
+@relro2_ptrtoaddr = constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @relro2_ptrtoaddr to i64)) to i32)
+
 ; CHECK:      .section .rodata.obj
 ; CHECK-NEXT: .globl obj
 ; CHECK:      obj:
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 5aa266d..69abf6e 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1447,3 +1447,158 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
   %r = icmp eq i512 %a, %b
   ret i1 %r
 }
+
+; Tests for any/allbits from memory.
+
+define i1 @anybits_i128_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i128_load_arg:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movq (%rdi), %rax
+; ANY-NEXT:    orq 8(%rdi), %rax
+; ANY-NEXT:    setne %al
+; ANY-NEXT:    retq
+  %ld = load i128, ptr %w
+  %cmp = icmp ne i128 %ld, 0
+  ret i1 %cmp
+}
+
+define i1 @allbits_i128_load_arg(ptr %w) {
+; SSE2-LABEL: allbits_i128_load_arg:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqb (%rdi), %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: allbits_i128_load_arg:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rdi), %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    ptest %xmm1, %xmm0
+; SSE41-NEXT:    setb %al
+; SSE41-NEXT:    retq
+;
+; AVXANY-LABEL: allbits_i128_load_arg:
+; AVXANY:       # %bb.0:
+; AVXANY-NEXT:    vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVXANY-NEXT:    vptest %xmm1, %xmm0
+; AVXANY-NEXT:    setb %al
+; AVXANY-NEXT:    retq
+  %ld = load i128, ptr %w
+  %cmp = icmp eq i128 %ld, -1
+  ret i1 %cmp
+}
+
+define i1 @anybits_i256_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i256_load_arg:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movq (%rdi), %rax
+; ANY-NEXT:    movq 8(%rdi), %rcx
+; ANY-NEXT:    orq 24(%rdi), %rcx
+; ANY-NEXT:    orq 16(%rdi), %rax
+; ANY-NEXT:    orq %rcx, %rax
+; ANY-NEXT:    setne %al
+; ANY-NEXT:    retq
+  %ld = load i256, ptr %w
+  %cmp = icmp ne i256 %ld, 0
+  ret i1 %cmp
+}
+
+define i1 @allbits_i256_load_arg(ptr %w) {
+; SSE-LABEL: allbits_i256_load_arg:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    andq 24(%rdi), %rcx
+; SSE-NEXT:    andq 16(%rdi), %rax
+; SSE-NEXT:    andq %rcx, %rax
+; SSE-NEXT:    cmpq $-1, %rax
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: allbits_i256_load_arg:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vptest %ymm1, %ymm0
+; AVX1-NEXT:    setb %al
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: allbits_i256_load_arg:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vptest %ymm1, %ymm0
+; AVX2-NEXT:    setb %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: allbits_i256_load_arg:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT:    vptest %ymm1, %ymm0
+; AVX512-NEXT:    setb %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %ld = load i256, ptr %w
+  %cmp = icmp eq i256 %ld, -1
+  ret i1 %cmp
+}
+
+define i1 @anybits_i512_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i512_load_arg:
+; ANY:       # %bb.0:
+; ANY-NEXT:    movq 16(%rdi), %rax
+; ANY-NEXT:    movq (%rdi), %rcx
+; ANY-NEXT:    movq 8(%rdi), %rdx
+; ANY-NEXT:    movq 24(%rdi), %rsi
+; ANY-NEXT:    orq 56(%rdi), %rsi
+; ANY-NEXT:    orq 40(%rdi), %rdx
+; ANY-NEXT:    orq %rsi, %rdx
+; ANY-NEXT:    orq 48(%rdi), %rax
+; ANY-NEXT:    orq 32(%rdi), %rcx
+; ANY-NEXT:    orq %rax, %rcx
+; ANY-NEXT:    orq %rdx, %rcx
+; ANY-NEXT:    setne %al
+; ANY-NEXT:    retq
+  %ld = load i512, ptr %w
+  %cmp = icmp ne i512 %ld, 0
+  ret i1 %cmp
+}
+
+define i1 @allbits_i512_load_arg(ptr %w) {
+; NO512-LABEL: allbits_i512_load_arg:
+; NO512:       # %bb.0:
+; NO512-NEXT:    movq 16(%rdi), %rax
+; NO512-NEXT:    movq (%rdi), %rcx
+; NO512-NEXT:    movq 8(%rdi), %rdx
+; NO512-NEXT:    movq 24(%rdi), %rsi
+; NO512-NEXT:    andq 56(%rdi), %rsi
+; NO512-NEXT:    andq 40(%rdi), %rdx
+; NO512-NEXT:    andq %rsi, %rdx
+; NO512-NEXT:    andq 48(%rdi), %rax
+; NO512-NEXT:    andq 32(%rdi), %rcx
+; NO512-NEXT:    andq %rax, %rcx
+; NO512-NEXT:    andq %rdx, %rcx
+; NO512-NEXT:    cmpq $-1, %rcx
+; NO512-NEXT:    sete %al
+; NO512-NEXT:    retq
+;
+; AVX512-LABEL: allbits_i512_load_arg:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
+; AVX512-NEXT:    vpcmpneqd (%rdi), %zmm0, %k0
+; AVX512-NEXT:    kortestw %k0, %k0
+; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %ld = load i512, ptr %w
+  %cmp = icmp eq i512 %ld, -1
+  ret i1 %cmp
+}
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4b..57da338 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -917,11 +917,11 @@ main:
     # CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02]
     f16x8.nearest
 
-    # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02]
-    f16x8.relaxed_madd
+    # CHECK: f16x8.madd # encoding: [0xfd,0xce,0x02]
+    f16x8.madd
 
-    # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02]
-    f16x8.relaxed_nmadd
+    # CHECK: f16x8.nmadd # encoding: [0xfd,0xcf,0x02]
+    f16x8.nmadd
 
     # CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02]
     i16x8.trunc_sat_f16x8_s
diff --git a/llvm/test/Other/debugcounter-dce.ll b/llvm/test/Other/debugcounter-dce.ll
index 54d929f..3b1dfb4 100644
--- a/llvm/test/Other/debugcounter-dce.ll
+++ b/llvm/test/Other/debugcounter-dce.ll
@@ -1,8 +1,16 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2  < %s | FileCheck %s
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2  < %s | FileCheck %s --check-prefixes=CHECK,NO-PRINT
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 -print-debug-counter-queries < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
 ;; Test that, with debug counters on, we will skip the first DCE opportunity, perform next 2,
 ;; and ignore all the others left.
 
+; NO-PRINT-NOT: DebugCounter
+; PRINT: DebugCounter dce-transform=0 skip
+; PRINT-NEXT: DebugCounter dce-transform=1 execute
+; PRINT-NEXT: DebugCounter dce-transform=2 execute
+; PRINT-NEXT: DebugCounter dce-transform=3 skip
+; PRINT-NEXT: DebugCounter dce-transform=4 skip
+
 ; CHECK-LABEL: @test
 ; CHECK-NEXT: %add1 = add i32 1, 2
 ; CHECK-NEXT: %sub1 = sub i32 %add1, 1
diff --git a/llvm/test/TableGen/listsplat.td b/llvm/test/TableGen/listsplat.td
index 5a93a4c..43803d6 100644
--- a/llvm/test/TableGen/listsplat.td
+++ b/llvm/test/TableGen/listsplat.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s
 
 // CHECK: ------------- Classes -----------------
 // CHECK-NEXT: class X<int X:a = ?, int X:b = ?> {
@@ -73,3 +74,8 @@ def DYa1 : Y<"a", 1>;
 def DYa2 : Y<"a", 2>;
 
 def DZ : X<42, !size([1, 2, 3])>;
+
+#ifdef ERROR1
+// ERROR1: !listsplat count -1 is negative
+defvar E = !listsplat("", -1);
+#endif
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index fae1365..e1d39fd 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -99,12 +99,15 @@ define float @test_3(i32 %a, i32 %b) {
   ret float %p
 }
 
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
 define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @test_4(
 ; CHECK-NEXT:    [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 1073741823)
 ; CHECK-NEXT:    [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], splat (i32 1073741823)
-; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]]
-; CHECK-NEXT:    [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double>
+; CHECK-NEXT:    [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x double>
+; CHECK-NEXT:    [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x double>
+; CHECK-NEXT:    [[RES:%.*]] = fadd <4 x double> [[A_AND_FP]], [[B_AND_FP]]
 ; CHECK-NEXT:    ret <4 x double> [[RES]]
 ;
   ; Drop two highest bits to guarantee that %a + %b doesn't overflow
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 702bbbb..57184ea 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1063,6 +1063,25 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345)
   ret float %mul3.i.i
 }
 
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
+define <2 x half> @test_ui_ui_i8_mul_vec(<2 x i8> noundef %x_in, <2 x i8> noundef %y_in) {
+; CHECK-LABEL: @test_ui_ui_i8_mul_vec(
+; CHECK-NEXT:    [[X:%.*]] = and <2 x i8> [[X_IN:%.*]], splat (i8 15)
+; CHECK-NEXT:    [[Y:%.*]] = and <2 x i8> [[Y_IN:%.*]], splat (i8 15)
+; CHECK-NEXT:    [[XF:%.*]] = uitofp nneg <2 x i8> [[X]] to <2 x half>
+; CHECK-NEXT:    [[YF:%.*]] = uitofp nneg <2 x i8> [[Y]] to <2 x half>
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x half> [[XF]], [[YF]]
+; CHECK-NEXT:    ret <2 x half> [[R]]
+;
+  %x = and <2 x i8> %x_in, splat (i8 15) ; keep only the low 4 bits of each lane (0..15)
+  %y = and <2 x i8> %y_in, splat (i8 15) ; same mask for the second operand
+  %xf = uitofp <2 x i8> %x to <2 x half> ; the checks expect only an added nneg flag here
+  %yf = uitofp <2 x i8> %y to <2 x half>
+  %r = fmul <2 x half> %xf, %yf ; expected to remain a vector fmul (no integer mul in the checks)
+  ret <2 x half> %r
+}
+
 define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_poison(i1 %c, i1 %.b, ptr %g_2345) {
 ; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_poison(
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
@@ -1091,8 +1110,9 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_poison(i1 %c,
 ; CHECK-NEXT:    [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
 ; CHECK-NEXT:    [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[MUL3_I_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT:    [[MUL3_I_I1:%.*]] = fmul <2 x float> [[MUL3_I_I]], <float poison, float 1.000000e+00>
 ; CHECK-NEXT:    store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
-; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I]]
+; CHECK-NEXT:    ret <2 x float> [[MUL3_I_I1]]
 ;
   %sel = select i1 %c, i32 65529, i32 53264
   %conv.i.s = trunc i32 %sel to i16
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 7b0b152..ffaa8b1 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -23,10 +23,7 @@ define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
 define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
 ; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
 ; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[A]] to i64
-; CHECK-NEXT:    [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
-; CHECK-NEXT:    [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32
-; CHECK-NEXT:    ret i32 [[TOADDR]]
+; CHECK-NEXT:    ret i32 [[A]]
 ;
   %toptr = inttoptr i32 %a to ptr addrspace(1)
   %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 9ed2240..9357adf 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -273,3 +273,106 @@ loop:
 exit:
   ret void
 }
+
+define void @ld_div2_ld_scevunknown_nonuniform(ptr %src.a, ptr noalias %src.b, ptr noalias %dst) {
+; CHECK-LABEL: define void @ld_div2_ld_scevunknown_nonuniform
+; CHECK-SAME: (ptr [[SRC_A:%.*]], ptr noalias [[SRC_B:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 3
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 4
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 5
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 6
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 7
+; CHECK-NEXT:    [[TMP32:%.*]] = udiv <8 x i64> [[TMP31]], splat (i64 2)
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <8 x i64> [[TMP32]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3
+; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4
+; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP41]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP43]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <8 x i64> [[TMP32]], i32 6
+; CHECK-NEXT:    [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7
+; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP47]]
+; CHECK-NEXT:    [[TMP49:%.*]] = load i32, ptr [[TMP34]], align 4
+; CHECK-NEXT:    [[TMP50:%.*]] = load i32, ptr [[TMP36]], align 4
+; CHECK-NEXT:    [[TMP51:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT:    [[TMP52:%.*]] = load i32, ptr [[TMP40]], align 4
+; CHECK-NEXT:    [[TMP53:%.*]] = load i32, ptr [[TMP42]], align 4
+; CHECK-NEXT:    [[TMP54:%.*]] = load i32, ptr [[TMP44]], align 4
+; CHECK-NEXT:    [[TMP55:%.*]] = load i32, ptr [[TMP46]], align 4
+; CHECK-NEXT:    [[TMP56:%.*]] = load i32, ptr [[TMP48]], align 4
+; CHECK-NEXT:    [[TMP57:%.*]] = insertelement <8 x i32> poison, i32 [[TMP49]], i32 0
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP50]], i32 1
+; CHECK-NEXT:    [[TMP59:%.*]] = insertelement <8 x i32> [[TMP58]], i32 [[TMP51]], i32 2
+; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP52]], i32 3
+; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <8 x i32> [[TMP60]], i32 [[TMP53]], i32 4
+; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP54]], i32 5
+; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <8 x i32> [[TMP62]], i32 [[TMP55]], i32 6
+; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP56]], i32 7
+; CHECK-NEXT:    [[TMP65:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    store <8 x i32> [[TMP64]], ptr [[TMP65]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br label [[SCALAR_PH:%.*]]
+; CHECK:       scalar.ph:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.a = getelementptr i32, ptr %src.a, i64 %iv ; address advances by one i32 per iteration
+  %load.a = load i64, ptr %gep.a ; i64 load from the i32-strided address; the value is data-dependent
+  %d = udiv i64 %load.a, 2 ; divides the loaded value (not %iv), so %d can differ per lane
+  %gep.b = getelementptr i32, ptr %src.b, i64 %d ; the checks expect one scalar GEP + load per lane (8 lanes)
+  %load.b = load i32, ptr %gep.b
+  %gep.dst = getelementptr i32, ptr %dst, i64 %iv ; consecutive in %iv: the checks expect a single <8 x i32> store
+  store i32 %load.b, ptr %gep.dst
+  %iv.next = add i64 %iv, 1
+  %exit.cond = icmp eq i64 %iv, 1000 ; tests %iv (not %iv.next): the body runs for %iv = 0..1000 inclusive
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
new file mode 100644
index 0000000..d281905
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+
+; REQUIRES: aarch64-registered-target
+
+; See the comment in `data-layout.ll` for an explanation.
+
+target triple = "aarch64-unknown-unknown"
+
+define void @multiply(ptr %A, ptr %B, ptr %C) {
+; PTR128-LABEL: @multiply(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128
+; PTR128-NEXT:    [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128
+; PTR128-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128
+; PTR128-NEXT:    [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR128-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR128:       alias_cont:
+; PTR128-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128
+; PTR128-NEXT:    [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR128-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR128:       copy:
+; PTR128-NEXT:    [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT:    br label [[NO_ALIAS]]
+; PTR128:       no_alias:
+; PTR128-NEXT:    [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR128-NEXT:    [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128
+; PTR128-NEXT:    [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128
+; PTR128-NEXT:    [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128
+; PTR128-NEXT:    [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR128-NEXT:    br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR128:       alias_cont1:
+; PTR128-NEXT:    [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128
+; PTR128-NEXT:    [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR128-NEXT:    br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR128:       copy2:
+; PTR128-NEXT:    [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT:    br label [[NO_ALIAS3]]
+; PTR128:       no_alias3:
+; PTR128-NEXT:    [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT:    [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR128-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR128-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR128-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR128-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT:    [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR128-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT:    [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR128-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT:    [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR128-NEXT:    [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT:    [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR128-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR128-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR128-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR128-NEXT:    store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR128-NEXT:    [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32
+; PTR128-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR128-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT:    [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR128-NEXT:    [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT:    [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR128-NEXT:    [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT:    [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT:    [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR128-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR128-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR128-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR128-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT:    [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR128-NEXT:    [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT:    [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR128-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT:    [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR128-NEXT:    [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT:    [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR128-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR128-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR128-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR128-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16
+; PTR128-NEXT:    store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR128-NEXT:    [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48
+; PTR128-NEXT:    store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR128-NEXT:    [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT:    [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT:    [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR128-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT:    [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR128-NEXT:    [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT:    [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR128-NEXT:    [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR128-NEXT:    [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR128-NEXT:    [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR128-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT:    [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR128-NEXT:    [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT:    [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR128-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT:    [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR128-NEXT:    [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT:    [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR128-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR128-NEXT:    [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR128-NEXT:    [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR128-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64
+; PTR128-NEXT:    store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR128-NEXT:    [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96
+; PTR128-NEXT:    store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR128-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT:    [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR128-NEXT:    [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT:    [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR128-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT:    [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR128-NEXT:    [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT:    [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR128-NEXT:    [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR128-NEXT:    [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR128-NEXT:    [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR128-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT:    [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR128-NEXT:    [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT:    [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR128-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT:    [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR128-NEXT:    [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT:    [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR128-NEXT:    [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR128-NEXT:    [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR128-NEXT:    [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT:    [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR128-NEXT:    [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT:    [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR128-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80
+; PTR128-NEXT:    store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR128-NEXT:    [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112
+; PTR128-NEXT:    store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR128-NEXT:    ret void
+;
+; PTR64-LABEL: @multiply(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
+; PTR64-NEXT:    [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
+; PTR64-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; PTR64-NEXT:    [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR64-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR64:       alias_cont:
+; PTR64-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
+; PTR64-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR64-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR64:       copy:
+; PTR64-NEXT:    [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT:    br label [[NO_ALIAS]]
+; PTR64:       no_alias:
+; PTR64-NEXT:    [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR64-NEXT:    [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
+; PTR64-NEXT:    [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
+; PTR64-NEXT:    [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i64
+; PTR64-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR64-NEXT:    br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR64:       alias_cont1:
+; PTR64-NEXT:    [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
+; PTR64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR64-NEXT:    br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR64:       copy2:
+; PTR64-NEXT:    [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT:    br label [[NO_ALIAS3]]
+; PTR64:       no_alias3:
+; PTR64-NEXT:    [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT:    [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR64-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR64-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR64-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR64-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT:    [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR64-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT:    [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR64-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT:    [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR64-NEXT:    [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT:    [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR64-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR64-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR64-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR64-NEXT:    store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR64-NEXT:    [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32
+; PTR64-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR64-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT:    [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR64-NEXT:    [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT:    [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR64-NEXT:    [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT:    [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT:    [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR64-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR64-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR64-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT:    [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR64-NEXT:    [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT:    [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR64-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT:    [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR64-NEXT:    [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT:    [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR64-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR64-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR64-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR64-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16
+; PTR64-NEXT:    store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR64-NEXT:    [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48
+; PTR64-NEXT:    store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR64-NEXT:    [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT:    [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT:    [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR64-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT:    [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR64-NEXT:    [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT:    [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR64-NEXT:    [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR64-NEXT:    [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR64-NEXT:    [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR64-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT:    [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR64-NEXT:    [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT:    [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR64-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT:    [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR64-NEXT:    [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT:    [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR64-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR64-NEXT:    [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR64-NEXT:    [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR64-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64
+; PTR64-NEXT:    store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR64-NEXT:    [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96
+; PTR64-NEXT:    store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR64-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT:    [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR64-NEXT:    [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT:    [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR64-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT:    [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR64-NEXT:    [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT:    [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR64-NEXT:    [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR64-NEXT:    [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR64-NEXT:    [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR64-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT:    [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR64-NEXT:    [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT:    [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR64-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT:    [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR64-NEXT:    [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT:    [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR64-NEXT:    [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR64-NEXT:    [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR64-NEXT:    [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT:    [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR64-NEXT:    [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT:    [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR64-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80
+; PTR64-NEXT:    store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR64-NEXT:    [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
+; PTR64-NEXT:    store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT:    ret void
+;
+; PTR32-LABEL: @multiply(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i32
+; PTR32-NEXT:    [[STORE_END:%.*]] = add nuw nsw i32 [[STORE_BEGIN]], 128
+; PTR32-NEXT:    [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i32
+; PTR32-NEXT:    [[TMP0:%.*]] = icmp ugt i32 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR32-NEXT:    br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR32:       alias_cont:
+; PTR32-NEXT:    [[LOAD_END:%.*]] = add nuw nsw i32 [[LOAD_BEGIN]], 128
+; PTR32-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR32-NEXT:    br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR32:       copy:
+; PTR32-NEXT:    [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT:    br label [[NO_ALIAS]]
+; PTR32:       no_alias:
+; PTR32-NEXT:    [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR32-NEXT:    [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i32
+; PTR32-NEXT:    [[STORE_END5:%.*]] = add nuw nsw i32 [[STORE_BEGIN4]], 128
+; PTR32-NEXT:    [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i32
+; PTR32-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR32-NEXT:    br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR32:       alias_cont1:
+; PTR32-NEXT:    [[LOAD_END7:%.*]] = add nuw nsw i32 [[LOAD_BEGIN6]], 128
+; PTR32-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR32-NEXT:    br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR32:       copy2:
+; PTR32-NEXT:    [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT:    br label [[NO_ALIAS3]]
+; PTR32:       no_alias3:
+; PTR32-NEXT:    [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT:    [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT:    [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR32-NEXT:    [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR32-NEXT:    [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR32-NEXT:    [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR32-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT:    [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR32-NEXT:    [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT:    [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR32-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT:    [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR32-NEXT:    [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT:    [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR32-NEXT:    [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR32-NEXT:    [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR32-NEXT:    [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR32-NEXT:    store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR32-NEXT:    [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i32 32
+; PTR32-NEXT:    store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR32-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT:    [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR32-NEXT:    [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT:    [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR32-NEXT:    [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT:    [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT:    [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR32-NEXT:    [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR32-NEXT:    [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR32-NEXT:    [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR32-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT:    [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR32-NEXT:    [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT:    [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR32-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT:    [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR32-NEXT:    [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT:    [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR32-NEXT:    [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR32-NEXT:    [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR32-NEXT:    [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR32-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i32 16
+; PTR32-NEXT:    store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR32-NEXT:    [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i32 48
+; PTR32-NEXT:    store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR32-NEXT:    [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT:    [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT:    [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR32-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT:    [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR32-NEXT:    [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT:    [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR32-NEXT:    [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR32-NEXT:    [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR32-NEXT:    [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR32-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT:    [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR32-NEXT:    [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT:    [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR32-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT:    [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR32-NEXT:    [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT:    [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR32-NEXT:    [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR32-NEXT:    [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR32-NEXT:    [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR32-NEXT:    [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i32 64
+; PTR32-NEXT:    store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR32-NEXT:    [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i32 96
+; PTR32-NEXT:    store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR32-NEXT:    [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT:    [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR32-NEXT:    [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT:    [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR32-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT:    [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR32-NEXT:    [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT:    [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR32-NEXT:    [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR32-NEXT:    [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR32-NEXT:    [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR32-NEXT:    [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT:    [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR32-NEXT:    [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT:    [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR32-NEXT:    [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT:    [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR32-NEXT:    [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT:    [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR32-NEXT:    [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR32-NEXT:    [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR32-NEXT:    [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT:    [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR32-NEXT:    [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT:    [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR32-NEXT:    [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i32 80
+; PTR32-NEXT:    store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR32-NEXT:    [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
+; PTR32-NEXT:    store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT:    ret void
+;
+entry:
+  %a = load <16 x double>, ptr %A, align 8
+  %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
+  store <16 x double> %c, ptr %C, align 8
+  ret void
+}
+
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
new file mode 100644
index 0000000..87def6b
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
+
+; To properly support the matrix intrinsics on, e.g., 32-bit platforms (without
+; the need to emit `libc` calls), we perform strided index calculations using
+; the same pointer bit-width as the matrix pointers, as determined by the data
+; layout. To verify this behaviour, this test runs several strided loads and
+; stores through the lowering pass with (32|64|128)-bit pointers, and verifies
+; the generated code extends / truncates strides accordingly. Similarly,
+; `data-layout-multiply-fused.ll` adopts this approach to verify the same
+; behaviour for index calculations emitted while lowering fused matrix
+; multiplies.
+
+define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i128(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]]
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]]
+; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT:    [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]]
+; PTR128-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i128(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64
+; PTR64-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT:    [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i128(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32
+; PTR32-NEXT:    [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT:    [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT:    ret <9 x double> [[TMP2]]
+;
+entry:
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3)
+  ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT:    ret <9 x double> [[TMP2]]
+;
+entry:
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3)
+  ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i64(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128
+; PTR128-NEXT:    [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT:    [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i64(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
+; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT:    [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
+; PTR64-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i64(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[STRIDE_CAST:%.*]] = trunc i64 [[STRIDE:%.*]] to i32
+; PTR32-NEXT:    [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT:    [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT:    ret <9 x double> [[TMP2]]
+;
+entry:
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 %stride, i1 false, i32 3, i32 3)
+  ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT:    ret <9 x double> [[TMP2]]
+;
+entry:
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 16, i1 false, i32 3, i32 3)
+  ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i32(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128
+; PTR128-NEXT:    [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT:    [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i32(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; PTR64-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT:    [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i32(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT:    [[VEC_START4:%.*]] = mul i32 2, [[STRIDE]]
+; PTR32-NEXT:    [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT:    [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT:    ret <9 x double> [[TMP2]]
+;
+entry:
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 %stride, i1 false, i32 3, i32 3)
+  ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR128-NEXT:  entry:
+; PTR128-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR64-NEXT:  entry:
+; PTR64-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT:    ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR32-NEXT:  entry:
+; PTR32-NEXT:    [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT:    [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT:    [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT:    [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT:    [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT:    [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT:    ret <9 x double> [[TMP2]]
+;
+entry:
+  %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 16, i1 false, i32 3, i32 3)
+  ret <9 x double> %load
+}
+
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
index ae7da19..abc4705 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
@@ -62,11 +62,12 @@ declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32,
 define <8 x double> @strided_load_4x2_stride_i32(ptr %in, i32 %stride) {
 ; CHECK-LABEL: @strided_load_4x2_stride_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT:    [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
 ; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT:    [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT:    [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
 ; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x double> [[TMP0]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
index 28e9cdb..81b8507 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
@@ -34,11 +34,12 @@ define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride
 ; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride(
 ; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT:    [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT:    [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[VEC_START]]
 ; CHECK-NEXT:    store <3 x double> [[SPLIT]], ptr [[VEC_GEP]], align 8
-; CHECK-NEXT:    [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i32 [[VEC_START2]]
+; CHECK-NEXT:    [[VEC_START2:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i64 [[VEC_START2]]
 ; CHECK-NEXT:    store <3 x double> [[SPLIT1]], ptr [[VEC_GEP3]], align 8
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
index 29198f34..205184b 100644
--- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll
+++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
@@ -3,55 +3,72 @@
 
 ; RUN: rm -rf %t && split-file %s %t && cd %t
 
-;; Read a text profile and merge it into indexed profile.
+;; Read text profiles and merge them into indexed profiles.
 ; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata
+; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata
 
 ;; Run optimizer pass on an IR module without IR functions, and test that global
 ;; variables in the module could be annotated (i.e., no early return),
 ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
 
 ;; Run optimizer pass on the IR, and check the section prefix.
 ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
 
-;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix.
-;; The output text IR shouldn't have `section_prefix`
+;; Run memprof with a profile that has no data access profiles. Test that the
+;; IR has module flag `EnableDataAccessProf` set to 0.
+; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG
+
+;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix.
+;; The output text IR shouldn't have `section_prefix` or EnableDataAccessProf module flag.
 ; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \
-; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix"
+; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix"
 
 ; LOG: Skip annotating string literal .str
 ; LOG: Global variable var1 is annotated as hot
 ; LOG: Global variable var2.llvm.125 is annotated as hot
 ; LOG: Global variable bar is not annotated
 ; LOG: Global variable foo is annotated as unlikely
-; LOG: Global variable var3 has explicit section name. Skip annotating.
-; LOG: Global variable var4 has explicit section name. Skip annotating.
+; LOG: Skip annotation for var3 due to explicit section name.
+; LOG: Skip annotation for var4 due to explicit section name.
+; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`.
+; LOG: Skip annotation for qux due to linker declaration.
 
 ;; String literals are not annotated.
-; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde"
-; PREFIX-NOT: section_prefix
-; PREFIX: @var1 = global i32 123, !section_prefix !0
+; IR: @.str = unnamed_addr constant [5 x i8] c"abcde"
+; IR-NOT: section_prefix
+; IR: @var1 = global i32 123, !section_prefix !0
 
 ;; @var.llvm.125 will be canonicalized to @var2 for profile look-up.
-; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
+; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
 
 ;; @bar is not seen in hot symbol or known symbol set, so it won't get a section
 ;; prefix. Test this by testing that there is no section_prefix between @bar and
 ;; @foo.
-; PREFIX-NEXT: @bar = global i16 3
-; PREFIX-NOT: !section_prefix
+; IR-NEXT: @bar = global i16 3
+; IR-NOT: !section_prefix
 
 ;; @foo is unlikely.
-; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1
+; IR-NEXT: @foo = global i8 2, !section_prefix !1
+
+; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
+; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+
+; IR: @llvm.fake_var = global i32 123
+; IR-NOT: !section_prefix
+; IR: @qux = external global i64
+; IR-NOT: !section_prefix
 
-; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
-; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+; IR: attributes #0 = { "rodata-section"="sec2" }
 
-; PREFIX: attributes #0 = { "rodata-section"="sec2" }
+; IR: !0 = !{!"section_prefix", !"hot"}
+; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1}
 
-; PREFIX: !0 = !{!"section_prefix", !"hot"}
-; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0}
+; FLAGLESS-NOT: EnableDataAccessProf
 
 ; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix.
 ; STAT: 2 memprof - Number of global vars with user-specified section (not annotated).
@@ -72,6 +89,24 @@ DataAccessProfiles:
     - foo
   KnownColdStrHashes: [ 999, 1001 ]
 ...
+;--- memprof-no-dap.yaml
+---
+# A memprof file without data access profiles. The heap records are simplified
+# to pass profile parsing and don't need to match the IR.
+HeapProfileRecords:
+  - GUID:            0xdeadbeef12345678
+    AllocSites:
+      - Callstack:
+          - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true }
+        MemInfoBlock:
+          AllocCount:      111
+          TotalSize:       222
+          TotalLifetime:   333
+          TotalLifetimeAccessDensity: 444
+    CallSites:
+      - Frames:
+        - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true }
+...
 ;--- input.ll
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu"
 @foo = global i8 2
 @var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
 @var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
 
 define i32 @func() {
   %a = load i32, ptr @var1
   %b = load i32, ptr @var2.llvm.125
-  %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b)
+  %c = load i32, ptr @llvm.fake_var
+  %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c)
   ret i32 %ret
 }
 
@@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu"
 @foo = global i8 2
 @var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
 @var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
+
 
 attributes #0 = { "rodata-section"="sec2" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
index c5f72f2..fded7a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,21 +4,9 @@
 define i32 @crash_reordering_undefs() {
 ; CHECK-LABEL: @crash_reordering_undefs(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT:    [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT:    [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT:    [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT:    [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT:    [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
+; CHECK-NEXT:    [[ADD0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 65537))
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
-; CHECK-NEXT:    ret i32 [[OP_RDX3]]
+; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;
 entry:
   %or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
index 3ac0d01..13b050d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
@@ -6,15 +6,15 @@ define i1 @test(i32 %g, i16 %d) {
 ; CHECK-SAME: i32 [[G:%.*]], i16 [[D:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = and i16 [[D]], 1
-; CHECK-NEXT:    [[XOR_I_I:%.*]] = xor i32 [[G]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i32> [[TMP2]], <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i8> [[TMP5]], <i8 -9, i8 -9, i8 -1, i8 -1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
index f07424f..43302f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
@@ -3,32 +3,7 @@
 
 define i32 @test() {
 ; CHECK-LABEL: define i32 @test() {
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 7, i32 7, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 67, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT:    [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8>
-; CHECK-NEXT:    [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]])
+; CHECK-NEXT:    [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
 ; CHECK-NEXT:    [[TMP24:%.*]] = sext i8 [[TMP23]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP24]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4..3e9bd78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,12 +3,8 @@
 
 define void @test() {
 ; CHECK-LABEL: define void @test() {
-; CHECK-NEXT:    [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 034fe82..c5442b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -6,11 +6,10 @@
 define void @foo() {
 ; CHECK-LABEL: define void @foo() {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
@@ -24,11 +23,10 @@ define void @foo() {
 ;
 ; FORCED-LABEL: define void @foo() {
 ; FORCED-NEXT:  bb:
-; FORCED-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
 ; FORCED-NEXT:    br label [[BB1:%.*]]
 ; FORCED:       bb1:
 ; FORCED-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; FORCED-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; FORCED-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
 ; FORCED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
 ; FORCED-NEXT:    [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
 ; FORCED-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 2612a21..e8078ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -5,23 +5,22 @@ define i32 @test(i1 %cond) {
 ; CHECK-LABEL: define i32 @test(
 ; CHECK-SAME: i1 [[COND:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[OR92:%.*]] = or i32 1, 0
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
-; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OR92]], %[[BB]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> <i32 poison, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[P1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[OR92]] = or i32 1, 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[OR92]], i32 0
-; CHECK-NEXT:    [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
+; CHECK-NEXT:    [[TMP8]] = xor <2 x i32> [[TMP9]], <i32 1, i32 0>
 ; CHECK-NEXT:    br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 4a5dd2a..b9f8390 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -8,42 +8,21 @@ define i16 @test() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 0, 0
-; CHECK-NEXT:    [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i32 0, 0
 ; CHECK-NEXT:    [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
 ; CHECK-NEXT:    [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 0, 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT:    [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 24, i32 5, i32 26, i32 7, i32 28, i32 29, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 25, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <24 x i32> [[TMP23]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 24, i32 25, i32 26, i32 27, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 4, i32 30, i32 6, i32 32, i32 33, i32 34, i32 poison, i32 36, i32 37, i32 38, i32 poison, i32 40, i32 poison, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24, i32 16, i32 26, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]]
-; CHECK-NEXT:    [[RDX_OP:%.*]] = shufflevector <24 x i1> [[TMP29]], <24 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <28 x i1> [[RDX_OP]] to i28
-; CHECK-NEXT:    [[TMP31:%.*]] = call i28 @llvm.ctpop.i28(i28 [[TMP30]])
-; CHECK-NEXT:    [[TMP32:%.*]] = trunc i28 [[TMP31]] to i16
-; CHECK-NEXT:    [[TMP33:%.*]] = call i4 @llvm.ctpop.i4(i4 -8)
-; CHECK-NEXT:    [[TMP34:%.*]] = zext i4 [[TMP33]] to i16
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = add i16 [[TMP34]], [[TMP32]]
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP1]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT:    [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>, [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i1> [[TMP13]] to i32
+; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP14]])
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = trunc i32 [[TMP15]] to i16
 ; CHECK-NEXT:    ret i16 [[OP_RDX4]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
index a7f8629..78708a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
@@ -6,20 +6,12 @@ define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr null, align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = and i8 0, 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i32 0, 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 false, i32 0, i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> <i8 0, i8 poison, i8 poison, i8 poison>, i8 [[TMP1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i8> zeroinitializer, zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
-; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 0, [[TMP14]]
 ; CHECK-NEXT:    store i32 [[OP_RDX]], ptr null, align 4
diff --git a/llvm/test/Verifier/llvm.used-invalid-init.ll b/llvm/test/Verifier/llvm.used-invalid-init.ll
index 15a961c..38c84b15 100644
--- a/llvm/test/Verifier/llvm.used-invalid-init.ll
+++ b/llvm/test/Verifier/llvm.used-invalid-init.ll
@@ -2,5 +2,5 @@
 
 @llvm.used = appending global [1 x ptr] zeroinitializer, section "llvm.metadata"
 
-; CHECK: wrong initalizer for intrinsic global variable
+; CHECK: wrong initializer for intrinsic global variable
 ; CHECK-NEXT: [1 x ptr] zeroinitializer