Diffstat (limited to 'llvm/test/CodeGen')
69 files changed, 7658 insertions, 250 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir new file mode 100644 index 0000000..824ada1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-add.mir @@ -0,0 +1,278 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.1: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:00011000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:00011010 SignBits:3 + %0:_(s8) = G_CONSTANT i8 2 + %1:_(s8) = G_CONSTANT i8 24 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstZero +body: | + bb.1: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:00000001 SignBits:7 + ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 1 + %1:_(s8) = G_CONSTANT i8 255 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @CstNegOne + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 255 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstSeven +body: | + bb.1: + ; CHECK-LABEL: name: @CstSeven + ; CHECK-NEXT: %0:_ KnownBits:00001000 SignBits:4 + ; CHECK-NEXT: %1:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000111 SignBits:5 + %0:_(s8) = G_CONSTANT i8 8 + %1:_(s8) = G_CONSTANT i8 255 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: CstNeg +body: | + bb.1: + ; CHECK-LABEL: name: @CstNeg + ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:11100010 SignBits:3 + %0:_(s8) = G_CONSTANT i8 224 + %1:_(s8) = G_CONSTANT i8 2 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: ScalarVar +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: ScalarRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_ADD %0, %1 +... +--- +name: ScalarNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:11111111 SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 255 + %4:_(s8) = G_ADD %2, %3 +... +--- +name: ScalarLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_ADD %1, %0 +... 
+--- +name: ScalarPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5 + ; CHECK-NEXT: %4:_ KnownBits:000????? SignBits:3 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 5 + %4:_(s8) = G_ADD %2, %3 +... +--- +name: VectorCstZero +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstZero + ; CHECK-NEXT: %0:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %1:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %3:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16 + %0:_(s16) = G_CONSTANT i16 1 + %1:_(s16) = G_CONSTANT i16 65535 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %4:_(<4 x s16>) = G_ADD %2, %3 +... +--- +name: VectorCstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstNegOne + ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %1:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %3:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16 + %0:_(s16) = G_CONSTANT i16 0 + %1:_(s16) = G_CONSTANT i16 65535 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %4:_(<4 x s16>) = G_ADD %2, %3 +... +--- +name: VectorVar +body: | + bb.1: + ; CHECK-LABEL: name: @VectorVar + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s16>) = G_ADD %0, %1 +... +--- +name: VectorRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_ADD %2, %0 +... +--- +name: VectorNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @VectorNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %5:_ KnownBits:1111111111111111 SignBits:16 + ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 65535 + %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4 + %6:_(<4 x s16>) = G_ADD %3, %5 +... +--- +name: VectorLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? 
SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_ADD %0, %2 +... +--- +name: VectorPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10 + ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9 + ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9 + ; CHECK-NEXT: %7:_ KnownBits:0000000????????? SignBits:7 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 42 + %5:_(s16) = G_CONSTANT i16 74 + %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4 + %7:_(<4 x s16>) = G_ADD %6, %3 +... +--- +name: VectorCst36 +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst36 + ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %4:_ KnownBits:000000000000???? SignBits:12 + %0:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = G_CONSTANT i16 6 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %4:_(<4 x s16>) = G_ADD %2, %3 +... + +--- +name: VectorCst3unknown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst3unknown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 + %4:_(<4 x s16>) = G_ADD %0, %3 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir index 8552931..ee35447 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir @@ -102,8 +102,8 @@ body: | ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 %0:_(<4 x s16>) = COPY $d0 - %2:_(s16) = COPY $h0 - %1:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 %4:_(<4 x s16>) = G_ASHR %0, %3 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir index 61d1c43..97bcb80 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir @@ -135,8 +135,8 @@ body: | ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 ; CHECK-NEXT: %4:_ KnownBits:???????????????? 
SignBits:1 %0:_(<4 x s16>) = COPY $d0 - %2:_(s16) = COPY $h0 - %1:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 %4:_(<4 x s16>) = G_SHL %0, %3 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir new file mode 100644 index 0000000..332049d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir @@ -0,0 +1,276 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s + +--- +name: Cst +body: | + bb.1: + ; CHECK-LABEL: name: @Cst + ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %2:_ KnownBits:00100010 SignBits:2 + %0:_(s8) = G_CONSTANT i8 2 + %1:_(s8) = G_CONSTANT i8 224 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstZero +body: | + bb.1: + ; CHECK-LABEL: name: @CstZero + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 0 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @CstNegOne + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7 + ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 1 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstNegFour +body: | + bb.1: + ; CHECK-LABEL: name: @CstNegFour + ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %1:_ KnownBits:00000100 SignBits:5 + ; CHECK-NEXT: %2:_ KnownBits:11111100 SignBits:6 + %0:_(s8) = G_CONSTANT i8 0 + %1:_(s8) = G_CONSTANT i8 4 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: CstNeg +body: | + bb.1: + ; CHECK-LABEL: name: @CstNeg + ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3 + ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:11011110 SignBits:2 + %0:_(s8) = G_CONSTANT i8 224 + %1:_(s8) = G_CONSTANT i8 2 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: ScalarVar +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarVar + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = COPY $b1 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: ScalarRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_SUB %0, %1 +... +--- +name: ScalarNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 0 + %4:_(s8) = G_SUB %3, %2 +... +--- +name: ScalarLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????? 
SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6 + ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 3 + %2:_(s8) = G_SUB %1, %0 +... +--- +name: ScalarPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @ScalarPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4 + ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4 + ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5 + ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:3 + %0:_(s8) = COPY $b0 + %1:_(s8) = G_CONSTANT i8 15 + %2:_(s8) = G_AND %0, %1 + %3:_(s8) = G_CONSTANT i8 5 + %4:_(s8) = G_SUB %2, %3 +... +--- +name: VectorCstZero +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstZero + ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000000 SignBits:16 + %0:_(s16) = G_CONSTANT i16 0 + %1:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_SUB %1, %2 +... +--- +name: VectorCstNegOne +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCstNegOne + ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15 + ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16 + %0:_(s16) = G_CONSTANT i16 0 + %1:_(s16) = G_CONSTANT i16 1 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %4:_(<4 x s16>) = G_SUB %2, %3 +... +--- +name: VectorVar +body: | + bb.1: + ; CHECK-LABEL: name: @VectorVar + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s16>) = G_SUB %0, %1 +... +--- +name: VectorRhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorRhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_SUB %2, %0 +... +--- +name: VectorNonNegative +body: | + bb.1: + ; CHECK-LABEL: name: @VectorNonNegative + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %5:_ KnownBits:0000000000000000 SignBits:16 + ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 0 + %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4 + %6:_(<4 x s16>) = G_SUB %5, %3 +... +--- +name: VectorLhsEarlyOut +body: | + bb.1: + ; CHECK-LABEL: name: @VectorLhsEarlyOut + ; CHECK-NEXT: %0:_ KnownBits:???????????????? 
SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 3 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_SUB %0, %2 +... +--- +name: VectorPartKnown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorPartKnown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8 + ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8 + ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10 + ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9 + ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9 + ; CHECK-NEXT: %7:_ KnownBits:???????????????? SignBits:7 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = G_CONSTANT i16 255 + %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1 + %3:_(<4 x s16>) = G_AND %0, %2 + %4:_(s16) = G_CONSTANT i16 42 + %5:_(s16) = G_CONSTANT i16 74 + %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4 + %7:_(<4 x s16>) = G_SUB %6, %3 +... +--- +name: VectorCst36 +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst36 + ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:12 + %0:_(s16) = G_CONSTANT i16 3 + %1:_(s16) = G_CONSTANT i16 6 + %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0 + %4:_(<4 x s16>) = G_SUB %2, %3 +... + +--- +name: VectorCst3unknown +body: | + bb.1: + ; CHECK-LABEL: name: @VectorCst3unknown + ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14 + ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1 + %0:_(<4 x s16>) = COPY $d0 + %1:_(s16) = COPY $h0 + %2:_(s16) = G_CONSTANT i16 3 + %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1 + %4:_(<4 x s16>) = G_SUB %0, %3 +... diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll index aa070b7..9b456a5 100644 --- a/llvm/test/CodeGen/AArch64/adds_cmn.ll +++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll @@ -22,10 +22,8 @@ entry: define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) { ; CHECK-LABEL: adds_cmn_c: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmn w0, w1 -; CHECK-NEXT: add w1, w1, w0 -; CHECK-NEXT: cset w8, lo -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: adds w1, w0, w1 +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y) diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll index dc88f94..cca190f 100644 --- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll +++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll @@ -1774,3 +1774,88 @@ define i128 @combine_i128_sdiv_const100(i128 %x) { %1 = sdiv i128 %x, 100 ret i128 %1 } + +; The following only becomes an sdiv_by_one after type legalisation, after which +; the splatted scalar constant has a different type to the splat vector. This +; test verifies DAGCombiner does not care about this type difference. 
+define <16 x i16> @combine_vec_sdiv_by_one_obfuscated(<16 x i16> %x) "target-features"="+sve" { +; CHECK-SD-LABEL: combine_vec_sdiv_by_one_obfuscated: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: combine_vec_sdiv_by_one_obfuscated: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v2.2d, #0000000000000000 +; CHECK-GI-NEXT: movi v3.8h, #1 +; CHECK-GI-NEXT: smov w8, v0.h[0] +; CHECK-GI-NEXT: mov v3.h[0], v2.h[0] +; CHECK-GI-NEXT: smov w9, v3.h[0] +; CHECK-GI-NEXT: smov w16, v3.h[7] +; CHECK-GI-NEXT: sdiv w14, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[1] +; CHECK-GI-NEXT: smov w9, v3.h[1] +; CHECK-GI-NEXT: sdiv w15, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[2] +; CHECK-GI-NEXT: smov w9, v3.h[2] +; CHECK-GI-NEXT: sdiv w13, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[3] +; CHECK-GI-NEXT: smov w9, v3.h[3] +; CHECK-GI-NEXT: sdiv w12, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[4] +; CHECK-GI-NEXT: smov w9, v3.h[4] +; CHECK-GI-NEXT: sdiv w11, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[5] +; CHECK-GI-NEXT: smov w9, v3.h[5] +; CHECK-GI-NEXT: sdiv w10, w8, w9 +; CHECK-GI-NEXT: smov w8, v0.h[6] +; CHECK-GI-NEXT: smov w9, v3.h[6] +; CHECK-GI-NEXT: movi v3.8h, #1 +; CHECK-GI-NEXT: smov w17, v3.h[0] +; CHECK-GI-NEXT: smov w18, v3.h[1] +; CHECK-GI-NEXT: smov w0, v3.h[2] +; CHECK-GI-NEXT: smov w1, v3.h[3] +; CHECK-GI-NEXT: smov w2, v3.h[4] +; CHECK-GI-NEXT: smov w3, v3.h[5] +; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: smov w9, v0.h[7] +; CHECK-GI-NEXT: fmov s0, w14 +; CHECK-GI-NEXT: mov v0.h[1], w15 +; CHECK-GI-NEXT: smov w15, v1.h[6] +; CHECK-GI-NEXT: mov v0.h[2], w13 +; CHECK-GI-NEXT: sdiv w9, w9, w16 +; CHECK-GI-NEXT: smov w16, v1.h[0] +; CHECK-GI-NEXT: mov v0.h[3], w12 +; CHECK-GI-NEXT: smov w12, v1.h[7] +; CHECK-GI-NEXT: mov v0.h[4], w11 +; CHECK-GI-NEXT: sdiv w16, w16, w17 +; CHECK-GI-NEXT: smov w17, v1.h[1] +; CHECK-GI-NEXT: mov v0.h[5], w10 +; CHECK-GI-NEXT: mov v0.h[6], w8 +; CHECK-GI-NEXT: sdiv w17, w17, w18 +; CHECK-GI-NEXT: smov w18, v1.h[2] +; CHECK-GI-NEXT: fmov s2, w16 +; CHECK-GI-NEXT: smov w16, v3.h[6] +; CHECK-GI-NEXT: mov v0.h[7], w9 +; CHECK-GI-NEXT: sdiv w18, w18, w0 +; CHECK-GI-NEXT: smov w0, v1.h[3] +; CHECK-GI-NEXT: mov v2.h[1], w17 +; CHECK-GI-NEXT: sdiv w0, w0, w1 +; CHECK-GI-NEXT: smov w1, v1.h[4] +; CHECK-GI-NEXT: mov v2.h[2], w18 +; CHECK-GI-NEXT: sdiv w1, w1, w2 +; CHECK-GI-NEXT: smov w2, v1.h[5] +; CHECK-GI-NEXT: mov v2.h[3], w0 +; CHECK-GI-NEXT: sdiv w14, w2, w3 +; CHECK-GI-NEXT: mov v2.h[4], w1 +; CHECK-GI-NEXT: sdiv w13, w15, w16 +; CHECK-GI-NEXT: smov w15, v3.h[7] +; CHECK-GI-NEXT: mov v2.h[5], w14 +; CHECK-GI-NEXT: sdiv w10, w12, w15 +; CHECK-GI-NEXT: mov v2.h[6], w13 +; CHECK-GI-NEXT: mov v2.h[7], w10 +; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: ret + %zero_and_ones = shufflevector <16 x i16> zeroinitializer, <16 x i16> splat (i16 1), <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %div = sdiv <16 x i16> %x, %zero_and_ones + ret <16 x i16> %div +} diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index ecd48d6..149b4c4 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -290,8 +290,7 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) { define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) { ; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w0, w1 -; CHECK-NEXT: cmn w1, w0 +; CHECK-NEXT: adds w8, 
w1, w0 ; CHECK-NEXT: csinv w0, w8, wzr, lo ; CHECK-NEXT: ret %noty = xor i32 %y, -1 @@ -331,8 +330,7 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) { define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, x1 -; CHECK-NEXT: cmn x1, x0 +; CHECK-NEXT: adds x8, x1, x0 ; CHECK-NEXT: csinv x0, x8, xzr, lo ; CHECK-NEXT: ret %noty = xor i64 %y, -1 diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index b6dee97e..b8d6c88 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -732,6 +732,247 @@ exit: ret void } +; This example corresponds to: +; +; __arm_agnostic("sme_za_state") void try_catch_agnostic_za_invoke() +; { +; try { +; agnostic_za_call(); +; } catch(...) { +; } +; } +; +; In this example we preserve all SME state enabled by PSTATE.ZA using +; `__arm_sme_save` before agnostic_za_call(). This is because on all normal +; returns from an agnostic ZA function ZA state should be preserved. That means +; we need to make sure ZA state is saved in case agnostic_za_call() throws, and +; we need to restore ZA state after unwinding to the catch block. + +define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_agnostic_za_invoke: +; CHECK: .Lfunc_begin5: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception5 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .Ltmp15: // EH_LABEL +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl agnostic_za_call +; CHECK-NEXT: .Ltmp16: // EH_LABEL +; CHECK-NEXT: .LBB5_1: // %exit +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_2: // %catch +; CHECK-NEXT: .Ltmp17: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB5_1 +; +; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke: +; CHECK-SDAG: .Lfunc_begin5: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception5 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: .Ltmp15: // EH_LABEL +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl agnostic_za_call +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: .Ltmp16: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB5_1: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB5_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp17: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: b .LBB5_1 +entry: + invoke void @agnostic_za_call() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + +; This is the same `try_catch_agnostic_za_invoke`, but shows a lazy save would +; also need to be committed in a shared-ZA function calling an agnostic-ZA function. +define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_inout_za_agnostic_za_callee: +; CHECK: .Lfunc_begin6: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception6 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-16] +; CHECK-NEXT: .Ltmp18: // EH_LABEL +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl agnostic_za_call +; CHECK-NEXT: .Ltmp19: // EH_LABEL +; CHECK-NEXT: .LBB6_1: // %exit +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB6_3 +; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_3: // %exit +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_4: // %catch +; CHECK-NEXT: .Ltmp20: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB6_1 +; +; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee: +; CHECK-SDAG: .Lfunc_begin6: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception6 +; CHECK-SDAG-NEXT: // %bb.0: // %entry +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp18: // EH_LABEL +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl agnostic_za_call +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_2 +; CHECK-SDAG-NEXT: // %bb.1: // %entry +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_2: // %entry +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp19: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB6_3: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB6_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp20: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl 
__cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB6_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB6_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB6_3 +entry: + invoke void @agnostic_za_call() + to label %exit unwind label %catch + +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) @@ -742,3 +983,4 @@ declare void @may_throw() declare void @shared_za_call() "aarch64_inout_za" declare void @noexcept_shared_za_call() "aarch64_inout_za" declare void @shared_zt0_call() "aarch64_inout_zt0" +declare void @agnostic_za_call() "aarch64_za_state_agnostic" diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll index f96a6f7..b239c46 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -1,13 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}kernel_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_default() #0 { +; GCN-LABEL: kernel_ieee_mode_default: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; 
GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}kernel_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_on() #1 { +; GCN-LABEL: kernel_ieee_mode_on: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 1 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 
+; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}kernel_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_kernel void @kernel_ieee_mode_off() #2 { +; GCN-LABEL: kernel_ieee_mode_off: +; GCN: .amd_kernel_code_t +; GCN-NEXT: amd_code_version_major = 1 +; GCN-NEXT: amd_code_version_minor = 2 +; GCN-NEXT: amd_machine_kind = 1 +; GCN-NEXT: amd_machine_version_major = 6 +; GCN-NEXT: amd_machine_version_minor = 0 +; GCN-NEXT: amd_machine_version_stepping = 0 +; GCN-NEXT: kernel_code_entry_byte_offset = 256 +; GCN-NEXT: kernel_code_prefetch_byte_size = 0 +; GCN-NEXT: granulated_workitem_vgpr_count = 0 +; GCN-NEXT: granulated_wavefront_sgpr_count = 0 +; GCN-NEXT: priority = 0 +; GCN-NEXT: float_mode = 240 +; GCN-NEXT: priv = 0 +; GCN-NEXT: enable_dx10_clamp = 1 +; GCN-NEXT: debug_mode = 0 +; GCN-NEXT: enable_ieee_mode = 0 +; GCN-NEXT: enable_wgp_mode = 0 +; GCN-NEXT: enable_mem_ordered = 0 +; GCN-NEXT: enable_fwd_progress = 0 +; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GCN-NEXT: user_sgpr_count = 12 +; 
GCN-NEXT: enable_trap_handler = 0 +; GCN-NEXT: enable_sgpr_workgroup_id_x = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_y = 1 +; GCN-NEXT: enable_sgpr_workgroup_id_z = 1 +; GCN-NEXT: enable_sgpr_workgroup_info = 0 +; GCN-NEXT: enable_vgpr_workitem_id = 2 +; GCN-NEXT: enable_exception_msb = 0 +; GCN-NEXT: granulated_lds_size = 0 +; GCN-NEXT: enable_exception = 0 +; GCN-NEXT: enable_sgpr_private_segment_buffer = 1 +; GCN-NEXT: enable_sgpr_dispatch_ptr = 1 +; GCN-NEXT: enable_sgpr_queue_ptr = 1 +; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GCN-NEXT: enable_sgpr_dispatch_id = 1 +; GCN-NEXT: enable_sgpr_flat_scratch_init = 0 +; GCN-NEXT: enable_sgpr_private_segment_size = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GCN-NEXT: enable_wavefront_size32 = 0 +; GCN-NEXT: enable_ordered_append_gds = 0 +; GCN-NEXT: private_element_size = 1 +; GCN-NEXT: is_ptr64 = 1 +; GCN-NEXT: is_dynamic_callstack = 0 +; GCN-NEXT: is_debug_enabled = 0 +; GCN-NEXT: is_xnack_enabled = 0 +; GCN-NEXT: workitem_private_segment_byte_size = 0 +; GCN-NEXT: workgroup_group_segment_byte_size = 0 +; GCN-NEXT: gds_segment_byte_size = 0 +; GCN-NEXT: kernarg_segment_byte_size = 16 +; GCN-NEXT: workgroup_fbarrier_count = 0 +; GCN-NEXT: wavefront_sgpr_count = 4 +; GCN-NEXT: workitem_vgpr_count = 2 +; GCN-NEXT: reserved_vgpr_first = 0 +; GCN-NEXT: reserved_vgpr_count = 0 +; GCN-NEXT: reserved_sgpr_first = 0 +; GCN-NEXT: reserved_sgpr_count = 0 +; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GCN-NEXT: debug_private_segment_buffer_sgpr = 0 +; GCN-NEXT: kernarg_segment_alignment = 4 +; GCN-NEXT: group_segment_alignment = 4 +; GCN-NEXT: private_segment_alignment = 4 +; GCN-NEXT: wavefront_size = 6 +; GCN-NEXT: call_convention = -1 +; GCN-NEXT: runtime_loader_kernel_symbol = 0 +; GCN-NEXT: .end_amd_kernel_code_t +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_default() #0 { +; GCN-LABEL: func_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; 
GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_on() #1 { +; GCN-LABEL: func_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}func_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define void @func_ieee_mode_off() #2 { +; GCN-LABEL: func_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_default() #0 { +; GCN-LABEL: cs_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() 
#0 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_on() #1 { +; GCN-LABEL: cs_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}cs_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_cs void @cs_ieee_mode_off() #2 { +; GCN-LABEL: cs_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_default: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_default() #0 { +; GCN-LABEL: ps_ieee_mode_default: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_on: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] -; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] -; GCN-NOT: v_mul_f32 define 
amdgpu_ps void @ps_ieee_mode_on() #1 { +; GCN-LABEL: ps_ieee_mode_on: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) @@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 { ret void } -; GCN-LABEL: {{^}}ps_ieee_mode_off: -; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] -; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-NOT: [[VAL0]] -; GCN-NOT: [[VAL1]] -; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]] -; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_off() #2 { +; GCN-LABEL: ps_ieee_mode_off: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_endpgm %val0 = load volatile float, ptr addrspace(1) poison %val1 = load volatile float, ptr addrspace(1) poison %min = call float @llvm.minnum.f32(float %val0, float %val1) diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir new file mode 100644 index 0000000..a4aad57 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir @@ -0,0 +1,59 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s + +--- +name: buffer_load_lds_not_valu +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: buffer_load_lds_not_valu + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 0 + ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0 + ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec + ; CHECK-NEXT: $m0 = S_MOV_B32 1 + ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0 + ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = 
V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec + ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0 + ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0 + ; CHECK-NEXT: S_ENDPGM 0 + $exec = IMPLICIT_DEF + %0:vgpr_32 = IMPLICIT_DEF + %1:sgpr_128 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec + %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0 + $m0 = S_MOV_B32 1 + BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0 + %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec + %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec + %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec + %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec + %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec + %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1, 0 + SCHED_GROUP_BARRIER 2, 2, 0 + SCHED_GROUP_BARRIER 4, 1, 0 + SCHED_GROUP_BARRIER 2, 4, 0 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir index 9553fcc..f11fe4a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir @@ -59,6 +59,15 @@ body: | ... --- +name: src_shared_base_to_vcc +body: | + bb.0: + ; GFX9-LABEL: name: src_shared_base_to_vcc + ; GFX9: $vcc = S_MOV_B64 $src_shared_base + $vcc = COPY $src_shared_base +...
+ +--- name: sgpr96_aligned_src_dst body: | bb.0: diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir index c8fee5d..7cbe5de 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir @@ -119,9 +119,10 @@ body: | ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]] %2(s16) = G_CTLZ %1 - ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]] - ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] - ; CHECK: $r0 = COPY [[R]] + ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]] + ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]] + ; LIBCALLS: $r0 = COPY [[R]] + ; CLZ: $r0 = COPY [[R32]] %3(s32) = G_SEXT %2(s16) $r0 = COPY %3(s32) BX_RET 14, $noreg, implicit $r0 diff --git a/llvm/test/CodeGen/ARM/carry.ll b/llvm/test/CodeGen/ARM/carry.ll index 558e2b0..a652241 100644 --- a/llvm/test/CodeGen/ARM/carry.ll +++ b/llvm/test/CodeGen/ARM/carry.ll @@ -1,61 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s define i64 @f1(i64 %a, i64 %b) { ; CHECK-LABEL: f1: -; CHECK: subs r -; CHECK: sbc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbc r1, r1, r3 +; CHECK-NEXT: bx lr entry: - %tmp = sub i64 %a, %b - ret i64 %tmp + %tmp = sub i64 %a, %b + ret i64 %tmp } define i64 @f2(i64 %a, i64 %b) { ; CHECK-LABEL: f2: -; CHECK: lsl r -; CHECK: orr r -; CHECK: rsbs r -; CHECK: sbc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: lsl r1, r1, #1 +; CHECK-NEXT: orr r1, r1, r0, lsr #31 +; CHECK-NEXT: rsbs r0, r2, r0, lsl #1 +; CHECK-NEXT: sbc r1, r1, r3 +; CHECK-NEXT: bx lr entry: - %tmp1 = shl i64 %a, 1 - %tmp2 = sub i64 %tmp1, %b - ret i64 %tmp2 + %tmp1 = shl i64 %a, 1 + %tmp2 = sub i64 %tmp1, %b + ret i64 %tmp2 } ; add with live carry define i64 @f3(i32 %al, i32 %bl) { ; CHECK-LABEL: f3: -; CHECK: adds r -; CHECK: adc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: adcs r0, r1, #0 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: bx lr entry: - ; unsigned wide add - %aw = zext i32 %al to i64 - %bw = zext i32 %bl to i64 - %cw = add i64 %aw, %bw - ; ch == carry bit - %ch = lshr i64 %cw, 32 - %dw = add i64 %ch, %bw - ret i64 %dw + ; unsigned wide add + %aw = zext i32 %al to i64 + %bw = zext i32 %bl to i64 + %cw = add i64 %aw, %bw + ; ch == carry bit + %ch = lshr i64 %cw, 32 + %dw = add i64 %ch, %bw + ret i64 %dw } ; rdar://10073745 define i64 @f4(i64 %x) nounwind readnone { -entry: ; CHECK-LABEL: f4: -; CHECK: rsbs r -; CHECK: rsc r +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: rsc r1, r1, #0 +; CHECK-NEXT: bx lr +entry: %0 = sub nsw i64 0, %x ret i64 %0 } ; rdar://12559385 define i64 @f5(i32 %vi) { -entry: ; CHECK-LABEL: f5: -; CHECK: movw [[REG:r[0-9]+]], #36102 -; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]] - %v0 = zext i32 %vi to i64 - %v1 = xor i64 %v0, -155057456198619 - %v4 = add i64 %v1, 155057456198619 - %v5 = add i64 %v4, %v1 - ret i64 %v5 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r1, #19493 +; CHECK-NEXT: movw r2, #29433 +; CHECK-NEXT: movt r1, #57191 +; CHECK-NEXT: eor r0, r0, r1 +; CHECK-NEXT: movw r3, #46043 +; CHECK-NEXT: movt r2, #65535 +; CHECK-NEXT: adds r0, r0, r0 +; CHECK-NEXT: movw r1, #36102 +; CHECK-NEXT: sbc r2, r2, r1 +; CHECK-NEXT: movt r3, #8344 
+; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc r1, r2, r1 +; CHECK-NEXT: bx lr +entry: + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 } diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir index 1030917..302f70f 100644 --- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir +++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck --strict-whitespace %s --- | target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64" diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll index 1edb387..f345e08 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll index 2e80c4c..29b130f 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | 
%ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr) declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll index 817b1d5..4e463a14 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1( diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll index cbf647f..fc8cce4 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll @@ -1,8 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} declare void @llvm.nvvm.tcgen05.fence.before.thread.sync() declare void @llvm.nvvm.tcgen05.fence.after.thread.sync() diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll index a37b1a9..22eb729 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - 
-mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100f | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %} ; CHECK-LABEL: nvvm_tcgen05_ld_16x64b define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) { diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll index bf2adac..33483b5 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | %ptxas-verify -arch=sm_110a %} declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll index 0636a06..ccf6541 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll @@ -2,9 +2,13 @@ ; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s ; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} ; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %} ; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %} +; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %} +; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %} ; CHECK-LABEL: nvvm_tcgen05_st_16x64b define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x 
i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) { diff --git a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll b/llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll index 0f7e0c7..1325abf 100644 --- a/llvm/test/CodeGen/PowerPC/check-zero-vector.ll +++ b/llvm/test/CodeGen/PowerPC/compare-vector-with-zero.ll @@ -95,3 +95,80 @@ declare i4 @llvm.ctpop.i4(i4) #1 !6 = !{!"short", !7, i64 0} !7 = !{!"omnipotent char", !8, i64 0} !8 = !{!"Simple C/C++ TBAA"} + +; Function to lock down changes for floating-point vector comparisons +define range(i32 0, 5) i32 @cols_needed(ptr %colauths){ +; POWERPC_64LE-LABEL: cols_needed: +; POWERPC_64LE: # %bb.0: # %entry +; POWERPC_64LE-NEXT: lxv vs0, 0(r3) +; POWERPC_64LE-NEXT: xxlxor vs1, vs1, vs1 +; POWERPC_64LE-NEXT: li r4, 4 +; POWERPC_64LE-NEXT: li r3, 0 +; POWERPC_64LE-NEXT: xvcmpeqsp vs0, vs0, vs1 +; POWERPC_64LE-NEXT: xxlnor v2, vs0, vs0 +; POWERPC_64LE-NEXT: vextuwrx r4, r4, v2 +; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2 +; POWERPC_64LE-NEXT: rlwinm r4, r4, 1, 30, 30 +; POWERPC_64LE-NEXT: sub r3, r4, r3 +; POWERPC_64LE-NEXT: mfvsrwz r4, v2 +; POWERPC_64LE-NEXT: rlwinm r4, r4, 2, 29, 29 +; POWERPC_64LE-NEXT: or r3, r3, r4 +; POWERPC_64LE-NEXT: li r4, 12 +; POWERPC_64LE-NEXT: vextuwrx r4, r4, v2 +; POWERPC_64LE-NEXT: slwi r4, r4, 3 +; POWERPC_64LE-NEXT: or r3, r3, r4 +; POWERPC_64LE-NEXT: clrlwi r3, r3, 28 +; POWERPC_64LE-NEXT: stb r3, -1(r1) +; POWERPC_64LE-NEXT: lbz r3, -1(r1) +; POWERPC_64LE-NEXT: popcntd r3, r3 +; POWERPC_64LE-NEXT: blr +; +; POWERPC_64-LABEL: cols_needed: +; POWERPC_64: # %bb.0: # %entry +; POWERPC_64-NEXT: lxv vs0, 0(r3) +; POWERPC_64-NEXT: xxlxor vs1, vs1, vs1 +; POWERPC_64-NEXT: li r4, 8 +; POWERPC_64-NEXT: xvcmpeqsp vs0, vs0, vs1 +; POWERPC_64-NEXT: xxlnor v2, vs0, vs0 +; POWERPC_64-NEXT: vextuwlx r4, r4, v2 +; POWERPC_64-NEXT: mfvsrwz r3, v2 +; POWERPC_64-NEXT: rlwinm r4, r4, 1, 30, 30 +; POWERPC_64-NEXT: rlwimi r4, r3, 2, 29, 29 +; POWERPC_64-NEXT: li r3, 0 +; POWERPC_64-NEXT: vextuwlx r3, r3, v2 +; POWERPC_64-NEXT: rlwimi r4, r3, 3, 0, 28 +; POWERPC_64-NEXT: li r3, 12 +; POWERPC_64-NEXT: vextuwlx r3, r3, v2 +; POWERPC_64-NEXT: sub r3, r4, r3 +; POWERPC_64-NEXT: clrlwi r3, r3, 28 +; POWERPC_64-NEXT: stb r3, -1(r1) +; POWERPC_64-NEXT: lbz r3, -1(r1) +; POWERPC_64-NEXT: popcntd r3, r3 +; POWERPC_64-NEXT: blr +; +; POWERPC_32-LABEL: cols_needed: +; POWERPC_32: # %bb.0: # %entry +; POWERPC_32-NEXT: lxv vs0, 0(r3) +; POWERPC_32-NEXT: xxlxor vs1, vs1, vs1 +; POWERPC_32-NEXT: xvcmpeqsp vs0, vs0, vs1 +; POWERPC_32-NEXT: xxlnor vs0, vs0, vs0 +; POWERPC_32-NEXT: stxv vs0, -32(r1) +; POWERPC_32-NEXT: lwz r3, -24(r1) +; POWERPC_32-NEXT: lwz r4, -28(r1) +; POWERPC_32-NEXT: rlwinm r3, r3, 1, 30, 30 +; POWERPC_32-NEXT: rlwimi r3, r4, 2, 29, 29 +; POWERPC_32-NEXT: lwz r4, -32(r1) +; POWERPC_32-NEXT: rlwimi r3, r4, 3, 0, 28 +; POWERPC_32-NEXT: lwz r4, -20(r1) +; POWERPC_32-NEXT: sub r3, r3, r4 +; POWERPC_32-NEXT: clrlwi r3, r3, 28 +; POWERPC_32-NEXT: popcntw r3, r3 +; POWERPC_32-NEXT: blr +entry: + %0 = load <4 x float>, ptr %colauths, align 4, !tbaa !5 + %1 = fcmp une <4 x float> %0, zeroinitializer + %2 = bitcast <4 x i1> %1 to i4 + %3 = tail call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 %2) + %4 = zext nneg i4 %3 to i32 + ret i32 %4 +} diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll index 2a46a59..4f036d3 100644 --- a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll +++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll @@ -221,8 +221,8 @@ define i64
@test12(i64 %0) #0 { ; ; RV64-LABEL: test12: ; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a0, a0, -16 -; RV64-NEXT: addi a0, a0, 13 +; RV64-NEXT: addi a0, a0, -16 +; RV64-NEXT: addiw a0, a0, 13 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/i64-icmp.ll b/llvm/test/CodeGen/RISCV/i64-icmp.ll index 88d989d..2742b9a 100644 --- a/llvm/test/CodeGen/RISCV/i64-icmp.ll +++ b/llvm/test/CodeGen/RISCV/i64-icmp.ll @@ -708,8 +708,7 @@ define i64 @icmp_sle_constant_neg_2050(i64 %a) nounwind { define i64 @icmp_eq_zext_inreg_small_constant(i64 %a) nounwind { ; RV64I-LABEL: icmp_eq_zext_inreg_small_constant: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, -123 +; RV64I-NEXT: addiw a0, a0, -123 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = and i64 %a, 4294967295 @@ -748,8 +747,7 @@ define i64 @icmp_ne_zext_inreg_small_constant(i64 %a) nounwind { define i64 @icmp_ne_zext_inreg_large_constant(i64 %a) nounwind { ; RV64I-LABEL: icmp_ne_zext_inreg_large_constant: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, 2 +; RV64I-NEXT: addiw a0, a0, 2 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: ret %1 = and i64 %a, 4294967295 diff --git a/llvm/test/CodeGen/RISCV/min-max.ll b/llvm/test/CodeGen/RISCV/min-max.ll index acde8ad..e7f6899 100644 --- a/llvm/test/CodeGen/RISCV/min-max.ll +++ b/llvm/test/CodeGen/RISCV/min-max.ll @@ -5,6 +5,12 @@ ; RUN: FileCheck %s --check-prefixes=ZBB,RV32ZBB ; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | \ ; RUN: FileCheck %s --check-prefixes=ZBB,RV64ZBB +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s | \ +; RUN: FileCheck %s --check-prefixes=XQCI +; RUN: llc < %s -mtriple=riscv32 -mattr=+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB +; RUN: llc < %s -mtriple=riscv64 -mattr=+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB ; Basic tests. 
@@ -23,6 +29,27 @@ define signext i8 @smax_i8(i8 signext %a, i8 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a1, a0, .LBB0_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB0_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB0_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB0_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.smax.i8(i8 %a, i8 %b) ret i8 %c } @@ -42,6 +69,27 @@ define signext i16 @smax_i16(i16 signext %a, i16 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a1, a0, .LBB1_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB1_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB1_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB1_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.smax.i16(i16 %a, i16 %b) ret i16 %c } @@ -61,6 +109,27 @@ define signext i32 @smax_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a1, a0, .LBB2_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB2_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB2_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB2_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 %a, i32 %b) ret i32 %c } @@ -112,6 +181,41 @@ define i64 @smax_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: max a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a2, a0 +; XQCI-NEXT: slt a5, a3, a1 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a2, a0 +; RV32I-SFB-NEXT: slt a5, a3, a1 +; RV32I-SFB-NEXT: bne a1, a3, .LBB3_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB3_2: +; RV32I-SFB-NEXT: bnez a5, .LBB3_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB3_4: +; RV32I-SFB-NEXT: bnez a5, .LBB3_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB3_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a1, a0, .LBB3_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB3_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.smax.i64(i64 %a, i64 %b) ret i64 %c } @@ -131,6 +235,27 @@ define signext i8 @smin_i8(i8 signext %a, i8 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i8: +; RV32I-SFB: # %bb.0: +; 
RV32I-SFB-NEXT: blt a0, a1, .LBB4_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB4_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB4_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB4_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.smin.i8(i8 %a, i8 %b) ret i8 %c } @@ -150,6 +275,27 @@ define signext i16 @smin_i16(i16 signext %a, i16 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a0, a1, .LBB5_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB5_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB5_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB5_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.smin.i16(i16 %a, i16 %b) ret i16 %c } @@ -169,6 +315,27 @@ define signext i32 @smin_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvge a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: blt a0, a1, .LBB6_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB6_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB6_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB6_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 %a, i32 %b) ret i32 %c } @@ -220,6 +387,41 @@ define i64 @smin_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: min a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a0, a2 +; XQCI-NEXT: slt a5, a1, a3 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a0, a2 +; RV32I-SFB-NEXT: slt a5, a1, a3 +; RV32I-SFB-NEXT: bne a1, a3, .LBB7_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB7_2: +; RV32I-SFB-NEXT: bnez a5, .LBB7_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB7_4: +; RV32I-SFB-NEXT: bnez a5, .LBB7_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB7_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: blt a0, a1, .LBB7_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB7_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.smin.i64(i64 %a, i64 %b) ret i64 %c } @@ -239,6 +441,27 @@ define i8 @umax_i8(i8 zeroext %a, i8 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: maxu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a1, a0, .LBB8_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB8_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB8_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; 
RV64I-SFB-NEXT: .LBB8_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.umax.i8(i8 %a, i8 %b) ret i8 %c } @@ -258,6 +481,27 @@ define i16 @umax_i16(i16 zeroext %a, i16 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: maxu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i16: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a1, a0, .LBB9_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB9_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB9_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB9_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.umax.i16(i16 %a, i16 %b) ret i16 %c } @@ -277,6 +521,27 @@ define signext i32 @umax_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: maxu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a1, a0, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a1, a0, .LBB10_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB10_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB10_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB10_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umax.i32(i32 %a, i32 %b) ret i32 %c } @@ -328,6 +593,41 @@ define i64 @umax_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a2, a0 +; XQCI-NEXT: sltu a5, a3, a1 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a2, a0 +; RV32I-SFB-NEXT: sltu a5, a3, a1 +; RV32I-SFB-NEXT: bne a1, a3, .LBB11_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB11_2: +; RV32I-SFB-NEXT: bnez a5, .LBB11_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB11_4: +; RV32I-SFB-NEXT: bnez a5, .LBB11_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB11_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a1, a0, .LBB11_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB11_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.umax.i64(i64 %a, i64 %b) ret i64 %c } @@ -347,6 +647,27 @@ define zeroext i8 @umin_i8(i8 zeroext %a, i8 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: minu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i8: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i8: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a0, a1, .LBB12_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB12_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i8: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB12_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB12_2: +; RV64I-SFB-NEXT: ret %c = call i8 @llvm.umin.i8(i8 %a, i8 %b) ret i8 %c } @@ -366,6 +687,27 @@ define zeroext i16 @umin_i16(i16 zeroext %a, i16 zeroext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: minu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i16: +; 
XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i16: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a0, a1, .LBB13_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB13_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i16: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB13_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB13_2: +; RV64I-SFB-NEXT: ret %c = call i16 @llvm.umin.i16(i16 %a, i16 %b) ret i16 %c } @@ -385,6 +727,27 @@ define signext i32 @umin_i32(i32 signext %a, i32 signext %b) { ; ZBB: # %bb.0: ; ZBB-NEXT: minu a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.mvgeu a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: bltu a0, a1, .LBB14_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB14_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB14_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB14_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umin.i32(i32 %a, i32 %b) ret i32 %c } @@ -436,6 +799,41 @@ define i64 @umin_i64(i64 %a, i64 %b) { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: minu a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umin_i64: +; XQCI: # %bb.0: +; XQCI-NEXT: sltu a4, a0, a2 +; XQCI-NEXT: sltu a5, a1, a3 +; XQCI-NEXT: qc.mveq a5, a1, a3, a4 +; XQCI-NEXT: qc.mveqi a0, a5, 0, a2 +; XQCI-NEXT: qc.mveqi a1, a5, 0, a3 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_i64: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: sltu a4, a0, a2 +; RV32I-SFB-NEXT: sltu a5, a1, a3 +; RV32I-SFB-NEXT: bne a1, a3, .LBB15_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a5, a4 +; RV32I-SFB-NEXT: .LBB15_2: +; RV32I-SFB-NEXT: bnez a5, .LBB15_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB15_4: +; RV32I-SFB-NEXT: bnez a5, .LBB15_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a1, a3 +; RV32I-SFB-NEXT: .LBB15_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_i64: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: bltu a0, a1, .LBB15_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB15_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.umin.i64(i64 %a, i64 %b) ret i64 %c } @@ -450,6 +848,18 @@ define signext i32 @smin_same_op_i32(i32 signext %a) { ; ZBB-LABEL: smin_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 %a, i32 %a) ret i32 %c } @@ -462,6 +872,18 @@ define signext i32 @smax_same_op_i32(i32 signext %a) { ; ZBB-LABEL: smax_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 %a, i32 %a) ret i32 %c } @@ -474,6 +896,18 @@ define signext i32 @umin_same_op_i32(i32 signext %a) { ; ZBB-LABEL: umin_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: umin_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_same_op_i32: +; 
RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umin.i32(i32 %a, i32 %a) ret i32 %c } @@ -486,6 +920,18 @@ define signext i32 @umax_same_op_i32(i32 signext %a) { ; ZBB-LABEL: umax_same_op_i32: ; ZBB: # %bb.0: ; ZBB-NEXT: ret +; +; XQCI-LABEL: umax_same_op_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_same_op_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_same_op_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umax.i32(i32 %a, i32 %a) ret i32 %c } @@ -510,6 +956,19 @@ define signext i32 @smin_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smin_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 undef, i32 undef) ret i32 %c } @@ -532,6 +991,19 @@ define signext i32 @smax_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smax_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 undef, i32 undef) ret i32 %c } @@ -554,6 +1026,19 @@ define signext i32 @umin_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umin_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umin_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umin_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umin.i32(i32 undef, i32 undef) ret i32 %c } @@ -576,6 +1061,19 @@ define signext i32 @umax_undef_i32() { ; RV64ZBB: # %bb.0: ; RV64ZBB-NEXT: li a0, 0 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umax_undef_i32: +; XQCI: # %bb.0: +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_undef_i32: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_undef_i32: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a0, 0 +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.umax.i32(i32 undef, i32 undef) ret i32 %c } @@ -595,6 +1093,29 @@ define signext i32 @smax_i32_pos_constant(i32 signext %a) { ; ZBB-NEXT: li a1, 10 ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i32_pos_constant: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.lilti a0, a0, 11, 10 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i32_pos_constant: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a1, 10 +; RV32I-SFB-NEXT: blt a1, a0, .LBB24_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB24_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i32_pos_constant: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, 10 +; RV64I-SFB-NEXT: blt a1, a0, .LBB24_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB24_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smax.i32(i32 %a, i32 10) ret i32 %c } @@ -616,6 +1137,33 @@ define signext i32 @smax_i32_pos_constant_trailing_zeros(i32 signext %a) { ; ZBB-NEXT: li a1, 16 ; ZBB-NEXT: max a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smax_i32_pos_constant_trailing_zeros: +; XQCI: # %bb.0: +; XQCI-NEXT: andi a1, a0, -8 +; XQCI-NEXT: 
li a0, 16 +; XQCI-NEXT: qc.mvlt a0, a0, a1, a1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smax_i32_pos_constant_trailing_zeros: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: andi a1, a0, -8 +; RV32I-SFB-NEXT: li a0, 16 +; RV32I-SFB-NEXT: bge a0, a1, .LBB25_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB25_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smax_i32_pos_constant_trailing_zeros: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: andi a1, a0, -8 +; RV64I-SFB-NEXT: li a0, 16 +; RV64I-SFB-NEXT: bge a0, a1, .LBB25_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB25_2: +; RV64I-SFB-NEXT: ret %b = and i32 %a, -8 %c = call i32 @llvm.smax.i32(i32 %b, i32 16) %d = and i32 %c, -4 @@ -635,6 +1183,29 @@ define signext i32 @smin_i32_negone(i32 signext %a) { ; ZBB-NEXT: li a1, -1 ; ZBB-NEXT: min a0, a0, a1 ; ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i32_negone: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.ligei a0, a0, 0, -1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i32_negone: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a1, -1 +; RV32I-SFB-NEXT: bltz a0, .LBB26_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a1 +; RV32I-SFB-NEXT: .LBB26_2: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i32_negone: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, -1 +; RV64I-SFB-NEXT: bltz a0, .LBB26_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB26_2: +; RV64I-SFB-NEXT: ret %c = call i32 @llvm.smin.i32(i32 %a, i32 -1) ret i32 %c } @@ -672,6 +1243,34 @@ define i64 @smin_i64_negone(i64 %a) { ; RV64ZBB-NEXT: li a1, -1 ; RV64ZBB-NEXT: min a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: smin_i64_negone: +; XQCI: # %bb.0: +; XQCI-NEXT: qc.ligei a0, a1, 0, -1 +; XQCI-NEXT: qc.ligei a1, a1, 0, -1 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: smin_i64_negone: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a2, -1 +; RV32I-SFB-NEXT: bltz a1, .LBB27_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB27_2: +; RV32I-SFB-NEXT: bltz a1, .LBB27_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a1, a2 +; RV32I-SFB-NEXT: .LBB27_4: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: smin_i64_negone: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, -1 +; RV64I-SFB-NEXT: bltz a0, .LBB27_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB27_2: +; RV64I-SFB-NEXT: ret %c = call i64 @llvm.smin.i64(i64 %a, i64 -1) ret i64 %c } @@ -720,6 +1319,41 @@ define i64 @umax_i64_one(i64 %a, i64 %b) { ; RV64ZBB-NEXT: li a1, 1 ; RV64ZBB-NEXT: maxu a0, a0, a1 ; RV64ZBB-NEXT: ret +; +; XQCI-LABEL: umax_i64_one: +; XQCI: # %bb.0: +; XQCI-NEXT: mv a2, a1 +; XQCI-NEXT: qc.selectinei a2, 0, a0, 1 +; XQCI-NEXT: qc.liltui a0, a0, 2, 1 +; XQCI-NEXT: qc.mvnei a0, a1, 0, a2 +; XQCI-NEXT: ret +; +; RV32I-SFB-LABEL: umax_i64_one: +; RV32I-SFB: # %bb.0: +; RV32I-SFB-NEXT: li a2, 1 +; RV32I-SFB-NEXT: li a3, 1 +; RV32I-SFB-NEXT: beqz a1, .LBB28_2 +; RV32I-SFB-NEXT: # %bb.1: +; RV32I-SFB-NEXT: mv a3, a0 +; RV32I-SFB-NEXT: .LBB28_2: +; RV32I-SFB-NEXT: bnez a0, .LBB28_4 +; RV32I-SFB-NEXT: # %bb.3: +; RV32I-SFB-NEXT: mv a0, a2 +; RV32I-SFB-NEXT: .LBB28_4: +; RV32I-SFB-NEXT: beqz a1, .LBB28_6 +; RV32I-SFB-NEXT: # %bb.5: +; RV32I-SFB-NEXT: mv a0, a3 +; RV32I-SFB-NEXT: .LBB28_6: +; RV32I-SFB-NEXT: ret +; +; RV64I-SFB-LABEL: umax_i64_one: +; RV64I-SFB: # %bb.0: +; RV64I-SFB-NEXT: li a1, 1 +; RV64I-SFB-NEXT: bnez a0, .LBB28_2 +; RV64I-SFB-NEXT: # %bb.1: +; RV64I-SFB-NEXT: mv a0, a1 +; RV64I-SFB-NEXT: .LBB28_2: +; RV64I-SFB-NEXT: 
ret %c = call i64 @llvm.umax.i64(i64 %a, i64 1) ret i64 %c } diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll b/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll new file mode 100644 index 0000000..d9a49a1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive-O0-ATM-ATK.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v -O0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-RV64 + +define void @matmul() { +; CHECK-RV64-LABEL: matmul: +; CHECK-RV64: # %bb.0: # %entry +; CHECK-RV64-NEXT: li a0, 0 +; CHECK-RV64-NEXT: vsetvli zero, a0, 512 +; CHECK-RV64-NEXT: sf.vsettm zero, a0 +; CHECK-RV64-NEXT: sf.vtzero.t mt0 +; CHECK-RV64-NEXT: ret +entry: + call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 0, i64 0, i64 3, i64 1) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn +declare void @llvm.riscv.sf.vtzero.t.i64(i64 immarg, i64, i64, i64 immarg, i64 immarg) #0 diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir new file mode 100644 index 0000000..389283a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir @@ -0,0 +1,523 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \ +# RUN: -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s + +--- | + define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + ret void + } + + define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4) + ret void + } + + define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2) + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2) + ret void + } + + define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) { + entry: + %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl) + %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl) + call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl) + ret <vscale x 64 x i8> %1 + } + + define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 
64 x i8> %tile, i64 %vl, ptr %base) { + entry: + %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl) + %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl) + %2 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl) + call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl) + ret <vscale x 64 x i8> %2 + } + + define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) { + entry: + tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2) + call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn) + ret void + } + + define i64 @vsettnt_max(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2) + %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2) + ret i64 %0 + } + + define i64 @single_vsettm(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2) + ret i64 %0 + } + + define i64 @single_vsettn(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2) + ret i64 %0 + } + + define i64 @single_vsettk(i64 %vl) { + entry: + %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2) + ret i64 %0 + } + + define void @sf_vtzero(i64 %tm, i64 %tn) { + entry: + call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4) + ret void + } + + declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64) + declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64) + declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64) + declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64) + declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64) + declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64) + declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64) + declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64) + declare i64 @llvm.riscv.sf.vsettk.i64(i64, i64, i64) + declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64) +... 
+--- +name: xsfmm_same_state +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: vrm8 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$v8m8', virtual-reg: '%1' } + - { reg: '$x10', virtual-reg: '%2' } + - { reg: '$x11', virtual-reg: '%3' } + - { reg: '$x12', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-LABEL: name: xsfmm_same_state + ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %4:gprnox0 = COPY $x12 + %3:gprnox0 = COPY $x11 + %2:gprnox0 = COPY $x10 + %1:vrm8 = COPY $v16m8 + %0:vrm8 = COPY $v8m8 + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoRET +... 
+--- +name: xsfmm_different_state +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: vrm8 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$v8m8', virtual-reg: '%1' } + - { reg: '$x10', virtual-reg: '%2' } + - { reg: '$x11', virtual-reg: '%3' } + - { reg: '$x12', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-LABEL: name: xsfmm_different_state + ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %4:gprnox0 = COPY $x12 + %3:gprnox0 = COPY $x11 + %2:gprnox0 = COPY $x10 + %1:vrm8 = COPY $v16m8 + %0:vrm8 = COPY $v8m8 + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm + PseudoRET +... 
+--- +name: xsfmm_different_state_bf +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: vrm8 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$v8m8', virtual-reg: '%1' } + - { reg: '$x10', virtual-reg: '%2' } + - { reg: '$x11', virtual-reg: '%3' } + - { reg: '$x12', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-LABEL: name: xsfmm_different_state_bf + ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %4:gprnox0 = COPY $x12 + %3:gprnox0 = COPY $x11 + %2:gprnox0 = COPY $x10 + %1:vrm8 = COPY $v16m8 + %0:vrm8 = COPY $v8m8 + PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm + PseudoRET +... 
+--- +name: interleave_rvv_and_xsfmm +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: gprnox0 } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: vrm8 } + - { id: 5, class: vrm8 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$x10', virtual-reg: '%1' } + - { reg: '$x11', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $x10, $x11 + ; CHECK-LABEL: name: interleave_rvv_and_xsfmm + ; CHECK: liveins: $v8m8, $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %2:gpr = COPY $x11 + %1:gprnox0 = COPY $x10 + %0:vrm8 = COPY $v8m8 + %3:gpr = ADDI $x0, 1 + %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1 + %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0 + PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1 + $v8m8 = COPY %5:vrm8 + PseudoRET implicit $v8m8 +... 
+--- +name: interleave_rvv_and_xsfmm2 +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: gprnox0 } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: vrm8 } + - { id: 5, class: vrm8 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$x10', virtual-reg: '%1' } + - { reg: '$x11', virtual-reg: '%2' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $x10, $x11 + ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2 + ; CHECK: liveins: $v8m8, $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1 + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype + ; CHECK-NEXT: PseudoRET implicit $v8m8 + %2:gpr = COPY $x11 + %1:gprnox0 = COPY $x10 + %0:vrm8 = COPY $v8m8 + %3:gpr = ADDI $x0, 1 + %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0 + %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1 + %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0 + PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1 + $v8m8 = COPY %6:vrm8 + PseudoRET implicit $v8m8 +... 
+--- +name: consecutive_xsfmm +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: vrm8 } + - { id: 1, class: gprnox0 } + - { id: 2, class: gprnox0 } + - { id: 3, class: gprnox0 } + - { id: 4, class: gprnox0 } +liveins: + - { reg: '$v8m8', virtual-reg: '%0' } + - { reg: '$x10', virtual-reg: '%1' } + - { reg: '$x11', virtual-reg: '%2' } + - { reg: '$x12', virtual-reg: '%3' } + - { reg: '$x13', virtual-reg: '%4' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $v8m8, $x10, $x11, $x12, $x13 + ; CHECK-LABEL: name: consecutive_xsfmm + ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %0:vrm8 = COPY $v8m8 + %1:gprnox0 = COPY $x10 + %2:gprnox0 = COPY $x11 + %3:gprnox0 = COPY $x12 + %4:gprnox0 = COPY $x13 + PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm + PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1 + PseudoRET +... +--- +name: vsettnt_max +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: vsettnt_max + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype + %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype + %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype + $x10 = COPY %3:gprnox0 + PseudoRET implicit $x10 +... 
+--- +name: single_vsettm +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: single_vsettm + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype + $x10 = COPY %1:gprnox0 + PseudoRET implicit $x10 +... +--- +name: single_vsettn +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: single_vsettn + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype + $x10 = COPY %1:gprnox0 + PseudoRET implicit $x10 +... +--- +name: single_vsettk +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10 + ; CHECK-LABEL: name: single_vsettk + ; CHECK: liveins: $x10 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype + ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]] + ; CHECK-NEXT: PseudoRET implicit $x10 + %0:gprnox0 = COPY $x10 + %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype + $x10 = COPY %1:gprnox0 + PseudoRET implicit $x10 +... 
+--- +name: sf_vtzero +alignment: 4 +tracksRegLiveness: true +registers: + - { id: 0, class: gprnox0 } + - { id: 1, class: gprnox0 } +liveins: + - { reg: '$x10', virtual-reg: '%0' } + - { reg: '$x11', virtual-reg: '%1' } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x10, $x11 + ; CHECK-LABEL: name: sf_vtzero + ; CHECK: liveins: $x10, $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11 + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype + ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype + ; CHECK-NEXT: PseudoRET + %0:gprnox0 = COPY $x10 + %1:gprnox0 = COPY $x11 + PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4 + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll new file mode 100644 index 0000000..9b9a849 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e4m3.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e4m3.e4m3.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e4m3.e4m3 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e4m3.e4m3.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll new file mode 100644 index 0000000..b63974f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e4m3_e5m2.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e4m3.e5m2.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e4m3.e5m2 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e4m3.e5m2.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 
x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll new file mode 100644 index 0000000..62d629b1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e4m3.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e5m2.e4m3.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e5m2.e4m3 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e5m2.e4m3.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll new file mode 100644 index 0000000..7a90c97 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_e5m2_e5m2.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.e5m2.e5m2.iXLen.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.e5m2.e5m2 mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.e5m2.e5m2.iXLen.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll new file mode 100644 index 0000000..29451c6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_f_f.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+xsfmm32a32f -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+xsfmm32a32f -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv32f16(iXLen, <vscale x 32 x half>, <vscale x 32 x half>, iXLen,
iXLen, iXLen, iXLen) + +define void @test_sf_mm_f_f_w2_f16m8(iXLen %mtd, <vscale x 32 x half> %v1, <vscale x 32 x half> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_f_f_w2_f16m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e16, w2 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.f.f mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.f.f.iXLen.nxv32f16(iXLen 0, <vscale x 32 x half> %v1, <vscale x 32 x half> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 2) + ret void +} + +declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv16f32(iXLen, <vscale x 16 x float>, <vscale x 16 x float>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_f_f_w1_f32m8(iXLen %mtd, <vscale x 16 x float> %v1, <vscale x 16 x float> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_f_f_w1_f32m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.f.f mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.f.f.iXLen.nxv16f32(iXLen 0, <vscale x 16 x float> %v1, <vscale x 16 x float> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 1) + ret void +} + +declare void @llvm.riscv.sf.mm.f.f.iXLen.nxv8f64(iXLen, <vscale x 8 x double>, <vscale x 8 x double>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_f_f_w1_f64m8(iXLen %mtd, <vscale x 8 x double> %v1, <vscale x 8 x double> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_f_f_w1_f64m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.f.f mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.f.f.iXLen.nxv8f64(iXLen 0, <vscale x 8 x double> %v1, <vscale x 8 x double> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 1) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll new file mode 100644 index 0000000..6a4b29f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_s.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.s.s.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_s_s_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_s_s_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.s.s mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.s.s.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll new file mode 100644 index 0000000..79239b0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_s_u.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.s.u.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_s_u_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_s_u_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.s.u mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.s.u.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll new file mode 100644 index 0000000..b0d039b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_s.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.u.s.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_u_s_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_u_s_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.u.s mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.mm.u.s.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll new file mode 100644 index 0000000..913c277 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_mm_u_u.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+xsfmm32a8i \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.mm.u.u.iXLen.nxv64i8.nxv64i8(iXLen, <vscale x 64 x i8>, <vscale x 64 x i8>, iXLen, iXLen, iXLen, iXLen) + +define void @test_sf_mm_u_u_w4_i8m8_i8m8(iXLen %mtd, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk) { +; CHECK-LABEL: test_sf_mm_u_u_w4_i8m8_i8m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a1 +; CHECK-NEXT: sf.vsettk zero, a3 +; CHECK-NEXT: sf.mm.u.u mt0, v8, v16 +; CHECK-NEXT: ret + entry: + call void 
@llvm.riscv.sf.mm.u.u.iXLen.nxv64i8.nxv64i8(iXLen 0, <vscale x 64 x i8> %v1, <vscale x 64 x i8> %v2, iXLen %tm, iXLen %tn, iXLen %tk, iXLen 4) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll new file mode 100644 index 0000000..8048dec --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte16.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte16.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte16(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e16, w1 +; CHECK-NEXT: sf.vlte16 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte16.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll new file mode 100644 index 0000000..a526dc8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte32.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte32.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte32(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1 +; CHECK-NEXT: sf.vlte32 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte32.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll new file mode 100644 index 0000000..ed0c48a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte64.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 
's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte64.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte64(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1 +; CHECK-NEXT: sf.vlte64 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte64.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll new file mode 100644 index 0000000..67b3ed2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vlte8.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vlte8.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vlte8(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vlte8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w1 +; CHECK-NEXT: sf.vlte8 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vlte8.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll new file mode 100644 index 0000000..4da37fa --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettk.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare iXLen @llvm.riscv.sf.vsettk.iXLen(iXLen, iXLen, iXLen) + +define iXLen @test_sf_vsettk(iXLen %tk) { +; CHECK-LABEL: test_sf_vsettk: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a1, zero, e16, w2 +; CHECK-NEXT: sf.vsettk a0, a0 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettk.iXLen(iXLen %tk, iXLen 1, iXLen 2) + ret iXLen %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll 
new file mode 100644 index 0000000..143c26c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettm.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare iXLen @llvm.riscv.sf.vsettm.iXLen(iXLen, iXLen, iXLen) + +define iXLen @test_sf_vsettm(iXLen %tm) { +; CHECK-LABEL: test_sf_vsettm: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a1, zero, e8, w4 +; CHECK-NEXT: sf.vsettm a0, a0 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettm.iXLen(iXLen %tm, iXLen 0, iXLen 3) + ret iXLen %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll new file mode 100644 index 0000000..48fa1bc8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vsettnt.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen, iXLen, iXLen) + +define iXLen @test_sf_vsettnt_e8w1(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e8w1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e8, w1 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 1) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e8w2(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e8w2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e8, w2 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 2) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e8w4(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e8w4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e8, w4 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 0, iXLen 3) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e16w1(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e16w1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e16, w1 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 1) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e16w2(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e16w2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt 
a0, a0, e16, w2 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 2) + ret iXLen %0 +} + +define iXLen @test_sf_vsettnt_e16w4(iXLen %tn) { +; CHECK-LABEL: test_sf_vsettnt_e16w4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt a0, a0, e16, w4 +; CHECK-NEXT: ret + entry: + %0 = call iXLen @llvm.riscv.sf.vsettnt.iXLen(iXLen %tn, iXLen 1, iXLen 3) + ret iXLen %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll new file mode 100644 index 0000000..7a76151 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste16.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste16.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste16(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e16, w1 +; CHECK-NEXT: sf.vste16 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste16.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll new file mode 100644 index 0000000..8ff6e6a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste32.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste32.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste32(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e32, w1 +; CHECK-NEXT: sf.vste32 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste32.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll new file mode 100644 index 0000000..53990e4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste64.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc 
-mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste64.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste64(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e64, w1 +; CHECK-NEXT: sf.vste64 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste64.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll new file mode 100644 index 0000000..09b7259 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vste8.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vste8.iXLen(iXLen, ptr, iXLen) + +define dso_local void @test_sf_vste8(iXLen %tss, ptr %base, iXLen %vl) { +; CHECK-LABEL: test_sf_vste8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a2, e8, w1 +; CHECK-NEXT: sf.vste8 a0, (a1) +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vste8.iXLen(iXLen %tss, ptr %base, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll new file mode 100644 index 0000000..394eb60 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtdiscard.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vtdiscard() + +define dso_local void @test_sf_vtdiscard() { +; CHECK-LABEL: test_sf_vtdiscard: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vtdiscard +; CHECK-NEXT: ret + 
entry: + call void @llvm.riscv.sf.vtdiscard() + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll new file mode 100644 index 0000000..66c9d26 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_t_v.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.iXLen(iXLen, <vscale x 32 x bfloat>, iXLen) + +define void @test_sf_vtmv_t_v_bf16m8(iXLen %tss, <vscale x 32 x bfloat> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_bf16m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.iXLen(iXLen %tss, <vscale x 32 x bfloat> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv32f16.iXLen(iXLen, <vscale x 32 x half>, iXLen) + +define void @test_sf_vtmv_t_v_f16(iXLen %tss, <vscale x 32 x half> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv32f16.iXLen(iXLen %tss, <vscale x 32 x half> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv16f32.iXLen(iXLen, <vscale x 16 x float>, iXLen) + +define void @test_sf_vtmv_t_v_f32(iXLen %tss, <vscale x 16 x float> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv16f32.iXLen(iXLen %tss, <vscale x 16 x float> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv8f64.iXLen(iXLen, <vscale x 8 x double>, iXLen) + +define void @test_sf_vtmv_t_v_f64(iXLen %tss, <vscale x 8 x double> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv8f64.iXLen(iXLen %tss, <vscale x 8 x double> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv64i8.iXLen(iXLen, <vscale x 64 x i8>, iXLen) + +define void @test_sf_vtmv_t_v_i8(iXLen %tss, <vscale x 64 x i8> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e8, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.iXLen(iXLen %tss, <vscale x 64 x i8> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv32i16.iXLen(iXLen, <vscale x 32 x i16>, iXLen) + +define void @test_sf_vtmv_t_v_i16(iXLen %tss, <vscale x 32 x i16> %src, iXLen %vl) { +; CHECK-LABEL: 
test_sf_vtmv_t_v_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.iXLen(iXLen %tss, <vscale x 32 x i16> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv16i32.iXLen(iXLen, <vscale x 16 x i32>, iXLen) + +define void @test_sf_vtmv_t_v_i32(iXLen %tss, <vscale x 16 x i32> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv16i32.iXLen(iXLen %tss, <vscale x 16 x i32> %src, iXLen %vl) + ret void +} + +declare void @llvm.riscv.sf.vtmv.t.v.nxv8i64.iXLen(iXLen, <vscale x 8 x i64>, iXLen) + +define void @test_sf_vtmv_t_v_i64(iXLen %tss, <vscale x 8 x i64> %src, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_t_v_i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.t.v a0, v8 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.iXLen(iXLen %tss, <vscale x 8 x i64> %src, iXLen %vl) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll new file mode 100644 index 0000000..0dcc2ab --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtmv_v_t.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare <vscale x 32 x bfloat> @llvm.riscv.sf.vtmv.v.t.nxv32bf16.iXLen(iXLen, iXLen) + +define <vscale x 32 x bfloat> @test_sf_vtmv_v_t_bf16m8(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_bf16m8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 32 x bfloat> @llvm.riscv.sf.vtmv.v.t.nxv32bf16.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 32 x bfloat> %0 +} + +declare <vscale x 32 x half> @llvm.riscv.sf.vtmv.v.t.nxv32f16.iXLen(iXLen, iXLen) + +define <vscale x 32 x half> @test_sf_vtmv_v_t_f16(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_f16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 32 x half> @llvm.riscv.sf.vtmv.v.t.nxv32f16.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 32 x half> %0 +} + +declare <vscale x 16 x float> @llvm.riscv.sf.vtmv.v.t.nxv16f32.iXLen(iXLen, iXLen) + +define <vscale x 16 x float> @test_sf_vtmv_v_t_f32(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 16 x float> @llvm.riscv.sf.vtmv.v.t.nxv16f32.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 16 x float> %0 +} + 
+declare <vscale x 8 x double> @llvm.riscv.sf.vtmv.v.t.nxv8f64.iXLen(iXLen, iXLen) + +define <vscale x 8 x double> @test_sf_vtmv_v_t_f64(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 8 x double> @llvm.riscv.sf.vtmv.v.t.nxv8f64.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 8 x double> %0 +} + +declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.iXLen(iXLen, iXLen) + +define <vscale x 64 x i8> @test_sf_vtmv_v_t_i8(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e8, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 64 x i8> %0 +} + +declare <vscale x 32 x i16> @llvm.riscv.sf.vtmv.v.t.nxv32i16.iXLen(iXLen, iXLen) + +define <vscale x 32 x i16> @test_sf_vtmv_v_t_i16(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e16, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 32 x i16> @llvm.riscv.sf.vtmv.v.t.nxv32i16.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 32 x i16> %0 +} + +declare <vscale x 16 x i32> @llvm.riscv.sf.vtmv.v.t.nxv16i32.iXLen(iXLen, iXLen) + +define <vscale x 16 x i32> @test_sf_vtmv_v_t_i32(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e32, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 16 x i32> @llvm.riscv.sf.vtmv.v.t.nxv16i32.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 16 x i32> %0 +} + +declare <vscale x 8 x i64> @llvm.riscv.sf.vtmv.v.t.nxv8i64.iXLen(iXLen, iXLen) + +define <vscale x 8 x i64> @test_sf_vtmv_v_t_i64(iXLen %tss, iXLen %vl) { +; CHECK-LABEL: test_sf_vtmv_v_t_i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e64, w1 +; CHECK-NEXT: sf.vtmv.v.t v8, a0 +; CHECK-NEXT: ret + entry: + %0 = call <vscale x 8 x i64> @llvm.riscv.sf.vtmv.v.t.nxv8i64.iXLen(iXLen %tss, iXLen %vl) + ret <vscale x 8 x i64> %0 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll new file mode 100644 index 0000000..bbccb02 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/sifive_sf_vtzero_t.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \ +; RUN: -mattr=+zvfh -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+experimental-zvfbfmin -mattr=+xsfmmbase \ +; RUN: -mattr=+xsfmm32a -mattr=+xsfmm32a8f -mattr=+xsfmm32a4i -mattr=+xsfmm64a64f \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK + +declare void @llvm.riscv.sf.vtzero.t.iXLen(iXLen, iXLen, iXLen, iXLen, iXLen) +define void @test_sf_vtzero_t(iXLen %tm, iXLen %tn) { +; CHECK-LABEL: test_sf_vtzero_t: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sf.vsettnt zero, a1, e8, w4 +; CHECK-NEXT: sf.vsettm zero, a0 +; 
CHECK-NEXT: sf.vtzero.t mt0 +; CHECK-NEXT: ret + entry: + call void @llvm.riscv.sf.vtzero.t.iXLen(iXLen 0, iXLen %tm, iXLen %tn, iXLen 3, iXLen 4) + ret void +} + diff --git a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll index 2f03ff9..318268a 100644 --- a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll +++ b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll @@ -15,8 +15,7 @@ define i32 @from_cmpeq(i32 %xx, i32 %y) { ; ; RV64I-LABEL: from_cmpeq: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, -9 +; RV64I-NEXT: addiw a0, a0, -9 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: ret @@ -39,8 +38,7 @@ define i32 @from_cmpeq_fail_bad_andmask(i32 %xx, i32 %y) { ; ; RV64I-LABEL: from_cmpeq_fail_bad_andmask: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a0, a0, -9 +; RV64I-NEXT: addiw a0, a0, -9 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: addi a0, a0, -1 ; RV64I-NEXT: and a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/setcc-logic.ll b/llvm/test/CodeGen/RISCV/setcc-logic.ll index fabb573..4e14893 100644 --- a/llvm/test/CodeGen/RISCV/setcc-logic.ll +++ b/llvm/test/CodeGen/RISCV/setcc-logic.ll @@ -104,9 +104,8 @@ define i1 @and_icmps_const_not1bit_diff(i32 %x) nounwind { ; ; RV64I-LABEL: and_icmps_const_not1bit_diff: ; RV64I: # %bb.0: -; RV64I-NEXT: sext.w a0, a0 -; RV64I-NEXT: addi a1, a0, -44 -; RV64I-NEXT: addi a0, a0, -92 +; RV64I-NEXT: addiw a1, a0, -44 +; RV64I-NEXT: addiw a0, a0, -92 ; RV64I-NEXT: snez a1, a1 ; RV64I-NEXT: snez a0, a0 ; RV64I-NEXT: and a0, a1, a0 diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index bdbe4ed..07bfbe6 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -674,8 +674,7 @@ define i32 @sext_of_not_cmp_i32(i32 %x) { ; ; RV64-LABEL: sext_of_not_cmp_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: addi a0, a0, -7 +; RV64-NEXT: addiw a0, a0, -7 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: ret @@ -718,8 +717,7 @@ define i32 @dec_of_zexted_cmp_i32(i32 %x) { ; ; RV64-LABEL: dec_of_zexted_cmp_i32: ; RV64: # %bb.0: -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: addi a0, a0, -7 +; RV64-NEXT: addiw a0, a0, -7 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 2751332c..bf6802d 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1047,8 +1047,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { ; RV64-LABEL: usubo.i32.constant.lhs: ; RV64: # %bb.0: # %entry ; RV64-NEXT: li a2, -2 -; RV64-NEXT: subw a2, a2, a0 -; RV64-NEXT: addi a0, a2, 1 +; RV64-NEXT: sub a2, a2, a0 +; RV64-NEXT: addiw a0, a2, 1 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret @@ -1065,8 +1065,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { ; RV64ZBA-LABEL: usubo.i32.constant.lhs: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: li a2, -2 -; RV64ZBA-NEXT: subw a2, a2, a0 -; RV64ZBA-NEXT: addi a0, a2, 1 +; RV64ZBA-NEXT: sub a2, a2, a0 +; RV64ZBA-NEXT: addiw a0, a2, 1 ; RV64ZBA-NEXT: seqz a0, a0 ; RV64ZBA-NEXT: sw a2, 0(a1) ; RV64ZBA-NEXT: ret @@ -1083,8 +1083,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { ; RV64ZICOND-LABEL: usubo.i32.constant.lhs: ; RV64ZICOND: # %bb.0: # %entry ; RV64ZICOND-NEXT: li a2, -2 -; RV64ZICOND-NEXT: subw a2, 
a2, a0 -; RV64ZICOND-NEXT: addi a0, a2, 1 +; RV64ZICOND-NEXT: sub a2, a2, a0 +; RV64ZICOND-NEXT: addiw a0, a2, 1 ; RV64ZICOND-NEXT: seqz a0, a0 ; RV64ZICOND-NEXT: sw a2, 0(a1) ; RV64ZICOND-NEXT: ret diff --git a/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll b/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll new file mode 100644 index 0000000..ddc2585 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-compiler-used.ll @@ -0,0 +1,19 @@ +; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; Verify that llvm.compiler.used is not lowered. +; CHECK: OpName %{{[0-9]+}} "unused" +; CHECK-NOT: OpName %{{[0-9]+}} "llvm.compiler.used" + +; Check that the type of llvm.compiler.used is not emitted too. +; CHECK-NOT: OpTypeArray + +@unused = private addrspace(3) global i32 0 +@llvm.compiler.used = appending addrspace(2) global [1 x ptr addrspace (4)] [ptr addrspace(4) addrspacecast (ptr addrspace(3) @unused to ptr addrspace(4))] + +define spir_func void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll b/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll index c6ee804..07fbed9 100644 --- a/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/htm-intrinsics.ll @@ -90,7 +90,7 @@ define i32 @test_tbegin_nofloat4(i32 %pad, ptr %ptr) { ; CHECK: tbegin 0, 65292 ; CHECK: ipm %r2 ; CHECK: srl %r2, 28 -; CHECK: ciblh %r2, 2, 0(%r14) +; CHECK: bnhr %r14 ; CHECK: mvhi 0(%r3), 0 ; CHECK: br %r14 %res = call i32 @llvm.s390.tbegin.nofloat(ptr null, i32 65292) @@ -219,7 +219,7 @@ define i32 @test_tend2(i32 %pad, ptr %ptr) { ; CHECK: tend ; CHECK: ipm %r2 ; CHECK: srl %r2, 28 -; CHECK: ciblh %r2, 2, 0(%r14) +; CHECK: bnhr %r14 ; CHECK: mvhi 0(%r3), 0 ; CHECK: br %r14 %res = call i32 @llvm.s390.tend() diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll new file mode 100644 index 0000000..6b8746e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-01.ll @@ -0,0 +1,738 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -O2 | FileCheck %s +; Test implementation of combining br_ccmask for flag output operand, and +; optimizing ipm sequence using conditional branches. + +declare void @dummy() + +; Check a case where the cc is used as an integer. +; Just (srl (ipm)) sequence without optimization. +define i32 @test(ptr %a) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ipm %r2 +; CHECK-NEXT: srl %r2, 28 +; CHECK-NEXT: br %r14 + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + ret i32 %cc +} + +; Test-1(f1_0_*). Test all 14 valid combinations, where cc is being used for +; branching. + +; Check (cc == 0). 
+define void @f1_0_eq_0(ptr %a) { +; CHECK-LABEL: f1_0_eq_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jge dummy@PLT +; CHECK-NEXT: .LBB1_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 0). +define void @f1_0_ne_0(ptr %a) { +; CHECK-LABEL: f1_0_ne_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgne dummy@PLT +; CHECK-NEXT: .LBB2_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ugt i32 %cc, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 1). +define void @f1_0_eq_1(ptr %a) { +; CHECK-LABEL: f1_0_eq_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgl dummy@PLT +; CHECK-NEXT: .LBB3_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 1 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 1). +define void @f1_0_ne_1(ptr %a) { +; CHECK-LABEL: f1_0_ne_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB4_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ne i32 %cc, 1 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 2). +define void @f1_0_eq_2(ptr %a) { +; CHECK-LABEL: f1_0_eq_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgh dummy@PLT +; CHECK-NEXT: .LBB5_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 2). 
+define void @f1_0_ne_2(ptr %a) { +; CHECK-LABEL: f1_0_ne_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnh dummy@PLT +; CHECK-NEXT: .LBB6_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ne i32 %cc, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 3). +define void @f1_0_eq_3(ptr %a) { +; CHECK-LABEL: f1_0_eq_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgo dummy@PLT +; CHECK-NEXT: .LBB7_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp eq i32 %cc, 3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc != 3). +define void @f1_0_ne_3(ptr %a) { +; CHECK-LABEL: f1_0_ne_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgno dummy@PLT +; CHECK-NEXT: .LBB8_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ult i32 %cc, 3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 0|1). +define void @f1_0_01(ptr %a) { +; CHECK-LABEL: f1_0_01: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgle dummy@PLT +; CHECK-NEXT: .LBB9_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ult i32 %cc, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 0|2). +define void @f1_0_02(ptr %a) { +; CHECK-LABEL: f1_0_02: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jghe dummy@PLT +; CHECK-NEXT: .LBB10_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 0|3). 
+define void @f1_0_03(ptr %a) { +; CHECK-LABEL: f1_0_03: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnlh dummy@PLT +; CHECK-NEXT: .LBB11_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp0 = icmp ne i32 %cc, 0 + %cmp3 = icmp ne i32 %cc, 3 + %cmp.inv = and i1 %cmp0, %cmp3 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 1|2). +define void @f1_0_12(ptr %a) { +; CHECK-LABEL: f1_0_12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jglh dummy@PLT +; CHECK-NEXT: .LBB12_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq1 = icmp eq i32 %cc, 1 + %cmpeq2 = icmp eq i32 %cc, 2 + %cmp = or i1 %cmpeq1, %cmpeq2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 1|3). +define void @f1_0_13(ptr %a) { +; CHECK-LABEL: f1_0_13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnhe dummy@PLT +; CHECK-NEXT: .LBB13_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq1 = icmp eq i32 %cc, 1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cmp = or i1 %cmpeq1, %cmpeq3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check (cc == 2|3). +define void @f1_0_23(ptr %a) { +; CHECK-LABEL: f1_0_23: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnle dummy@PLT +; CHECK-NEXT: .LBB14_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmp = icmp ugt i32 %cc, 1 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Test-2(f1_1_*/f1_2_*/fl_3_*/f1_4_*). +; Test Mixed patterns involving Binary Ops. + +; Check 'add' for (cc != 0). +define void @f1_1_1(ptr %a) { +; CHECK-LABEL: f1_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgne dummy@PLT +; CHECK-NEXT: .LBB15_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cmp = icmp ult i32 %add, 3 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'add' for (cc == 1|2). 
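; The 'add' cases rely on unsigned wrap of the assumed cc range: in the
; (cc == 1|2) case below, %add is cc - 1 and "icmp ult %add, 2" holds exactly
; for cc in {1,2} (cc == 0 wraps to 0xffffffff), so the CHECK line expects a
; single jglh with no ipm/compare sequence.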
+define void @f1_1_2(ptr %a) { +; CHECK-LABEL: f1_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jglh dummy@PLT +; CHECK-NEXT: .LBB16_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cmp = icmp ult i32 %add, 2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'add' for (cc == 1|2). +define void @f1_1_3(ptr %a) { +; CHECK-LABEL: f1_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jglh dummy@PLT +; CHECK-NEXT: .LBB17_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cmp.inv = icmp ult i32 %add, -2 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). +define void @f1_2_1(ptr %a) { +; CHECK-LABEL: f1_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB18_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cmp.inv = and i1 %cmpne3, %cmpne0 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and' with both operands select_ccmask(cc != 2). +define void @f1_2_2(ptr %a) { +; CHECK-LABEL: f1_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnh dummy@PLT +; CHECK-NEXT: .LBB19_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %and.cond.inv = and i1 %ugt1, %cmpne3 + br i1 %and.cond.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and/tm' for (cc == 0|2). 
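; In the tm-style case below, "(%cc & 1) == 0" selects the even condition
; codes, so the branch is taken for cc in {0,2} and the CHECK line expects
; jghe.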
+define void @f1_2_3(ptr %a) { +; CHECK-LABEL: f1_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jghe dummy@PLT +; CHECK-NEXT: .LBB20_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'and/tm' for (cc == 1|3). +define void @f1_2_4(ptr %a) { +; CHECK-LABEL: f1_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnhe dummy@PLT +; CHECK-NEXT: .LBB21_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define void @f1_2_5(ptr %a) { +; CHECK-LABEL: f1_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB22_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cmp = xor i1 %cmpne3, %trunc + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check nested 'xor' cc with select_ccmask(cc != 1). +define void @f1_3_1(ptr %a) { +; CHECK-LABEL: f1_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB23_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cmp.inv = xor i1 %cmpne3, %xor + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). 
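; Worked out for the case below: "trunc i32 %cc to i1" is the low bit of cc,
; so (cc == 3) xor lowbit(cc) is true only for cc == 1; branching on its
; inverse therefore tests cc != 1 and folds to jgnl.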
+define void @f1_3_2(ptr %a) { +; CHECK-LABEL: f1_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnl dummy@PLT +; CHECK-NEXT: .LBB24_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cmp.inv = xor i1 %cmpeq3, %trunc + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). +define void @f1_3_3(ptr %a) { +; CHECK-LABEL: f1_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgnh dummy@PLT +; CHECK-NEXT: .LBB25_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cmp.cond.inv = xor i1 %cmpne0, %trunc + br i1 %cmp.cond.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'or' with both operands are select_ccmask one with TM and other with +; ICMP(cc == 1). +define void @f1_4_1(ptr %a) { +; CHECK-LABEL: f1_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgl dummy@PLT +; CHECK-NEXT: .LBB26_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cmp.cond.inv = or i1 %cmpeq3, %cmpeq0 + br i1 %cmp.cond.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'or' for (cc == 0|1). +define void @f1_4_2(ptr %a) { +; CHECK-LABEL: f1_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgle dummy@PLT +; CHECK-NEXT: .LBB27_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cmp.inv = icmp samesign ugt i32 %or, -3 + br i1 %cmp.inv, label %exit, label %branch +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + +; Check 'or' for (cc == 0|1). 
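; In the 'or' cases, "or disjoint %cc, -4" maps cc 0..3 onto -4..-1; for the
; (cc == 0|1) case below, "icmp samesign ult %or, -2" is true exactly for
; cc in {0,1}, and the branch folds to jgle.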
+define void @f1_4_3(ptr %a) { +; CHECK-LABEL: f1_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: jgle dummy@PLT +; CHECK-NEXT: .LBB28_1: # %exit +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cmp = icmp samesign ult i32 %or, -2 + br i1 %cmp, label %branch, label %exit +branch: + tail call void @dummy() + br label %exit +exit: + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll new file mode 100644 index 0000000..b9b9a4b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/inline-asm-flag-output-02.ll @@ -0,0 +1,1665 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -O2 | FileCheck %s +; Test implementation of combining select_ccmask for flag output operand and +; optimizing ipm sequence using conditional branches. + +; Test-1(f2_0_*): Both TrueVal and FalseVal non-const(14-valid CCMask). + +; Check (cc == 0). +define i64 @f2_0_eq_0(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ber %r14 +; CHECK-NEXT: .LBB0_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 0). +define i64 @f2_0_ne_0(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB1_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ugt i32 %cc, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 1). +define i64 @f2_0_eq_1(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB2_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 1 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 1). 
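; For the select_ccmask cases, the CHECK lines show the select lowered to a
; conditional return rather than a branch: in the (cc != 1) case below,
; "bnlr %r14" returns %x (already in %r2) when cc is not 1, and the
; fall-through "lgr %r2, %r3" returns %y.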
+define i64 @f2_0_ne_1(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB3_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ne i32 %cc, 1 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 2). +define i64 @f2_0_eq_2(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB4_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 2). +define i64 @f2_0_ne_2(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnhr %r14 +; CHECK-NEXT: .LBB5_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ne i32 %cc, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 3). +define i64 @f2_0_eq_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_eq_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bor %r14 +; CHECK-NEXT: .LBB6_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp eq i32 %cc, 3 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc != 3). +define i64 @f2_0_ne_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_ne_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnor %r14 +; CHECK-NEXT: .LBB7_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ult i32 %cc, 3 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 0|1). 
+define i64 @f2_0_01(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_01: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB8_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ult i32 %cc, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 0|2). +define i64 @f2_0_02(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_02: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB9_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check (cc == 0|3). +define i64 @f2_0_03(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_0_03: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB10_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cmp0 = icmp ne i32 %cc, 0 + %cmp3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmp0, %cmp3 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check (cc == 1|2). +define i64 @f2_0_12(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_0_12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB11_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check (cc == 1|3). +define i64 @f2_0_13(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_0_13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB12_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check (cc == 2|3). 
+define i64 @f2_0_23(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_0_23: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB13_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %cond = icmp ugt i32 %cc, 1 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Test-2(f2_1_*/f2_2_*/f2_3_*/f2_4_*). +; Both TrueVal and FalseVal are non-const with mixed patterns involving +; Binary Ops. + +; Check 'add' for (cc != 0). +define i64 @f2_1_1(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB14_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f2_1_2(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB15_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f2_1_3(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB16_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). +define i64 @f2_2_1(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB17_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2). 
+define i64 @f2_2_2(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB18_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f2_2_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB19_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). +define i64 @f2_2_4(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB20_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f2_2_5(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB21_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). 
+define i64 @f2_3_1(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB22_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). +define i64 @f2_3_2(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB23_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). +define i64 @f2_3_3(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB24_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f2_4_1(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB25_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). 
+define i64 @f2_4_2(i64 %y, i64 %x, ptr %a) { +; CHECK-LABEL: f2_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB26_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 %y, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f2_4_3(i64 %x, i64 %y, ptr %a) { +; CHECK-LABEL: f2_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r4), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB27_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +; Test-3(f3_1_*/f3_2_*/f3_3_*/f3_4_*). +; TrueVal is non-const and FalseVal is const with mixed patterns involving +; Binary Ops. + +; Check 'add' for (cc != 0). +define i64 @f3_1_1(i64 %x, ptr %a) { +; CHECK-LABEL: f3_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB28_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f3_1_2(i64 %x, ptr %a) { +; CHECK-LABEL: f3_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB29_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f3_1_3(ptr %a, i64 %x) { +; CHECK-LABEL: f3_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB30_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). 
+define i64 @f3_2_1(ptr %a, i64 %x) { +; CHECK-LABEL: f3_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB31_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2). +define i64 @f3_2_2(ptr %a, i64 %x) { +; CHECK-LABEL: f3_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB32_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f3_2_3(i64 %x, ptr %a) { +; CHECK-LABEL: f3_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB33_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). +define i64 @f3_2_4(ptr %a, i64 %x) { +; CHECK-LABEL: f3_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB34_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). 
+define i64 @f3_2_5(i64 %x, ptr %a) { +; CHECK-LABEL: f3_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB35_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). +define i64 @f3_3_1(ptr %a, i64 %x) { +; CHECK-LABEL: f3_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB36_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). +define i64 @f3_3_2(ptr %a, i64 %x) { +; CHECK-LABEL: f3_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB37_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). +define i64 @f3_3_3(ptr %a, i64 %x) { +; CHECK-LABEL: f3_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB38_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). 
+define i64 @f3_4_1(ptr %a, i64 %x) { +; CHECK-LABEL: f3_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB39_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f3_4_2(ptr %a, i64 %x) { +; CHECK-LABEL: f3_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB40_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 5, i64 %x + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f3_4_3(i64 %x, ptr %a) { +; CHECK-LABEL: f3_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB41_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 %x, i64 5 + ret i64 %res +} + + +; Test-4(f4_1_*/f4_2_*/f4_3_*/f4_4_*). +; TrueVal is const and FalseVal is non-const with mixed patterns involving +; Binary Ops. + +; Check 'add' for (cc != 0). +define i64 @f4_1_1(ptr %a, i64 %y) { +; CHECK-LABEL: f4_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB42_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). 
+define i64 @f4_1_2(ptr %a, i64 %y) { +; CHECK-LABEL: f4_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB43_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f4_1_3(i64 %y, ptr %a) { +; CHECK-LABEL: f4_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB44_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). +define i64 @f4_2_1(i64 %y, ptr %a) { +; CHECK-LABEL: f4_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB45_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2). +define i64 @f4_2_2(i64 %y, ptr %a) { +; CHECK-LABEL: f4_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB46_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). +define i64 @f4_2_3(ptr %a, i64 %y) { +; CHECK-LABEL: f4_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB47_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). 
+define i64 @f4_2_4(i64 %y, ptr %a) { +; CHECK-LABEL: f4_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB48_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f4_2_5(ptr %a, i64 %y) { +; CHECK-LABEL: f4_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB49_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). +define i64 @f4_3_1(i64 %y, ptr %a) { +; CHECK-LABEL: f4_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB50_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). +define i64 @f4_3_2(i64 %y, ptr %a) { +; CHECK-LABEL: f4_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB51_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). 
+define i64 @f4_3_3(i64 %y, ptr %a) { +; CHECK-LABEL: f4_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB52_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f4_4_1(i64 %y,ptr %a) { +; CHECK-LABEL: f4_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB53_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f4_4_2(i64 %y, ptr %a) { +; CHECK-LABEL: f4_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r3), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB54_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 %y, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f4_4_3(ptr %a, i64 %y) { +; CHECK-LABEL: f4_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB55_1: # %entry +; CHECK-NEXT: lgr %r2, %r3 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 15, i64 %y + ret i64 %res +} + +; Test-5(f5_1_*/f5_2_*/f5_3_*/f5_4_*). +; Both TrueVal and FalseVal are const with mixed patterns involving +; Binary Ops. + + +; Check 'add' for (cc != 0). 
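; When both select values are constants, as in the (cc != 0) case below, the
; constant for the taken mask is loaded first (lghi %r2, 15), the conditional
; return "bner %r14" uses it, and the fall-through reloads the other constant
; (lghi %r2, 5).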
+define i64 @f5_1_1(ptr %a) { +; CHECK-LABEL: f5_1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bner %r14 +; CHECK-NEXT: .LBB56_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 3 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f5_1_2(ptr %a) { +; CHECK-LABEL: f5_1_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: blhr %r14 +; CHECK-NEXT: .LBB57_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -1 + %cond = icmp ult i32 %add, 2 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Check 'add' for (cc == 1|2). +define i64 @f5_1_3(ptr %a) { +; CHECK-LABEL: f5_1_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlhr %r14 +; CHECK-NEXT: .LBB58_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %add = add nsw i32 %cc, -3 + %cond.inv = icmp ult i32 %add, -2 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'and' with one operand cc and other select_ccmask(cc !=1). +define i64 @f5_2_1(ptr %a) { +; CHECK-LABEL: f5_2_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB59_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpne0 = icmp ne i32 %andcc, 0 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %cmpne3, %cmpne0 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'and' with both operands select_ccmask(cc != 2). +define i64 @f5_2_2(ptr %a) { +; CHECK-LABEL: f5_2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB60_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %ugt1 = icmp samesign ugt i32 %cc, 1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = and i1 %ugt1, %cmpne3 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'and/tm' for (cc == 0|2). 
+define i64 @f5_2_3(ptr %a) { +; CHECK-LABEL: f5_2_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB61_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond = icmp eq i32 %and, 0 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Check 'and/tm' for (cc == 1|3). +define i64 @f5_2_4(ptr %a) { +; CHECK-LABEL: f5_2_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB62_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %and = and i32 %cc, 1 + %cond.inv = icmp eq i32 %and, 0 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'icmp' with one operand 'and' and other 'select_ccmask'(cc != 1). +define i64 @f5_2_5(ptr %a) { +; CHECK-LABEL: f5_2_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB63_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne3 = icmp ne i32 %cc, 3 + %cond = xor i1 %cmpne3, %trunc + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + + +; Check nested 'xor' cc with select_ccmask(cc != 1). +define i64 @f5_3_1(ptr %a) { +; CHECK-LABEL: f5_3_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB64_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %cmpeq0 = icmp eq i32 %cc, 0 + %cmpeq2 = icmp eq i32 %cc, 2 + %xor = xor i1 %cmpeq0, %cmpeq2 + %cmpne3 = icmp ne i32 %cc, 3 + %cond.inv = xor i1 %cmpne3, %xor + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=1). 
+define i64 @f5_3_2(ptr %a) { +; CHECK-LABEL: f5_3_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: blr %r14 +; CHECK-NEXT: .LBB65_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = xor i1 %cmpeq3, %trunc + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check branching on 'tm' and 'xor' with one operand cc and the other +; select_ccmask(cc !=2). +define i64 @f5_3_3(ptr %a) { +; CHECK-LABEL: f5_3_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bhr %r14 +; CHECK-NEXT: .LBB66_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %trunc = trunc i32 %cc to i1 + %cmpne0 = icmp ne i32 %cc, 0 + %cond.inv = xor i1 %cmpne0, %trunc + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'or' with both operands select_ccmask with TM and ICMP(cc == 1). +define i64 @f5_4_1(ptr %a) { +; CHECK-LABEL: f5_4_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnlr %r14 +; CHECK-NEXT: .LBB67_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %cond.inv = or i1 %cmpeq3, %cmpeq0 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). +define i64 @f5_4_2(ptr %a) { +; CHECK-LABEL: f5_4_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: bnler %r14 +; CHECK-NEXT: .LBB68_1: # %entry +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond.inv = icmp samesign ugt i32 %or, -3 + %res = select i1 %cond.inv, i64 5, i64 15 + ret i64 %res +} + +; Check 'or' for (cc == 0|1). 
+define i64 @f5_4_3(ptr %a) { +; CHECK-LABEL: f5_4_3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bler %r14 +; CHECK-NEXT: .LBB69_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %tmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %tmp) + %or = or disjoint i32 %cc, -4 + %cond = icmp samesign ult i32 %or, -2 + %res = select i1 %cond, i64 15, i64 5 + ret i64 %res +} + +; Nested select_ccmask with TrueVal and FalseVal swapped with each other. +define i64 @f6_1(ptr %a) { +; CHECK-LABEL: f6_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: #APP +; CHECK-NEXT: alsi 0(%r2), -1 +; CHECK-EMPTY: +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: lghi %r2, 15 +; CHECK-NEXT: bher %r14 +; CHECK-NEXT: .LBB70_1: # %entry +; CHECK-NEXT: lghi %r2, 5 +; CHECK-NEXT: br %r14 +entry: + %cc = tail call i32 asm sideeffect "alsi $1,-1\0A", "={@cc},=*QS,*QS,~{memory}"(ptr elementtype(i32) %a, ptr elementtype(i32) %a) + %cmp = icmp ult i32 %cc, 4 + tail call void @llvm.assume(i1 %cmp) + %andcc = and i32 %cc, 1 + %cmpeq0 = icmp eq i32 %andcc, 0 + %cmpeq3 = icmp eq i32 %cc, 3 + %select = select i1 %cmpeq3, i64 5, i64 15 + %res = select i1 %cmpeq0, i64 %select, i64 5 + ret i64 %res +} + diff --git a/llvm/test/CodeGen/Thumb2/carry.ll b/llvm/test/CodeGen/Thumb2/carry.ll index 1e2b332..47c7918 100644 --- a/llvm/test/CodeGen/Thumb2/carry.ll +++ b/llvm/test/CodeGen/Thumb2/carry.ll @@ -1,35 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s define i64 @f1(i64 %a, i64 %b) { -entry: ; CHECK-LABEL: f1: -; CHECK: subs r0, r0, r2 -; CHECK: sbcs r1, r3 - %tmp = sub i64 %a, %b - ret i64 %tmp +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs r1, r3 +; CHECK-NEXT: bx lr +entry: + %tmp = sub i64 %a, %b + ret i64 %tmp } define i64 @f2(i64 %a, i64 %b) { -entry: ; CHECK-LABEL: f2: -; CHECK: lsls r1, r1, #1 -; CHECK: orr.w r1, r1, r0, lsr #31 -; CHECK: rsbs r0, r2, r0, lsl #1 -; CHECK: sbcs r1, r3 - %tmp1 = shl i64 %a, 1 - %tmp2 = sub i64 %tmp1, %b - ret i64 %tmp2 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: lsls r1, r1, #1 +; CHECK-NEXT: orr.w r1, r1, r0, lsr #31 +; CHECK-NEXT: rsbs r0, r2, r0, lsl #1 +; CHECK-NEXT: sbcs r1, r3 +; CHECK-NEXT: bx lr +entry: + %tmp1 = shl i64 %a, 1 + %tmp2 = sub i64 %tmp1, %b + ret i64 %tmp2 } ; rdar://12559385 define i64 @f3(i32 %vi) { -entry: ; CHECK-LABEL: f3: -; CHECK: movw [[REG:r[0-9]+]], #36102 -; CHECK: sbcs r{{[0-9]+}}, [[REG]] - %v0 = zext i32 %vi to i64 - %v1 = xor i64 %v0, -155057456198619 - %v4 = add i64 %v1, 155057456198619 - %v5 = add i64 %v4, %v1 - ret i64 %v5 +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r1, #19493 +; CHECK-NEXT: movt r1, #57191 +; CHECK-NEXT: eors r0, r1 +; CHECK-NEXT: movw r2, #29433 +; CHECK-NEXT: movw r3, #46043 +; CHECK-NEXT: movw r1, #36102 +; CHECK-NEXT: movt r2, #65535 +; CHECK-NEXT: adds r0, r0, r0 +; CHECK-NEXT: movt r3, #8344 +; CHECK-NEXT: sbcs r2, r1 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: bx lr +entry: + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 } diff --git 
a/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll new file mode 100644 index 0000000..abbd953 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=mvp -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s + +; This test ensures that loads and stores generated for small memcpy et al use +; constant offset folding. + + +target triple = "wasm32-unknown-unknown" + +define void @call_memset(ptr) #0 { +; CHECK-LABEL: call_memset: +; CHECK: .functype call_memset (i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.const $push0=, 0 +; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0 +; CHECK-NEXT: i64.const $push1=, 0 +; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1 +; CHECK-NEXT: # fallthrough-return + call void @llvm.memset.p0.i32(ptr align 1 %0, i8 0, i32 16, i1 false) + ret void +} + +define void @call_memcpy(ptr %dst, ptr %src) #0 { +; CHECK-LABEL: call_memcpy: +; CHECK: .functype call_memcpy (i32, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0 +; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0 +; CHECK-NEXT: i64.load $push1=, 0($1):p2align=0 +; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1 +; CHECK-NEXT: # fallthrough-return + call void @llvm.memcpy.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false) + ret void +} + + +define void @call_memmove(ptr %dst, ptr %src) #0 { +; CHECK-LABEL: call_memmove: +; CHECK: .functype call_memmove (i32, i32) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $2=, 0($1):p2align=0 +; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0 +; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0 +; CHECK-NEXT: i64.store 0($0):p2align=0, $2 +; CHECK-NEXT: # fallthrough-return + call void @llvm.memmove.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false) + ret void +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll new file mode 100644 index 0000000..3654aae --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mattr=+simd128 | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_sext_1: +; CHECK: .functype dot_sext_1 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.dot_i16x8_s +; CHECK-NEXT: # fallthrough-return + %sext1 = sext <8 x i16> %a to <8 x i32> + %sext2 = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %sext1, %sext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} + + +define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_sext_2: +; CHECK: .functype dot_sext_2 (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.dot_i16x8_s +; CHECK-NEXT: # fallthrough-return + %sext1 = sext <8 x i16> %a to <8 x i32> + %sext2 = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %sext1, %sext2 + %shuffle1 = 
shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle2, %shuffle1 + ret <4 x i32> %res +} + +define <4 x i32> @dot_sext_self(<8 x i16> %v) { +; CHECK-LABEL: dot_sext_self: +; CHECK: .functype dot_sext_self (v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32x4.dot_i16x8_s +; CHECK-NEXT: # fallthrough-return + %sext = sext <8 x i16> %v to <8 x i32> + %mul = mul <8 x i32> %sext, %sext + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} + +; INFO: Negative test +define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_zext: +; CHECK: .functype dot_zext (v128, v128) -> (v128) +; CHECK-NEXT: .local v128 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_low_i16x8_u +; CHECK-NEXT: local.tee 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_high_i16x8_u +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 +; CHECK-NEXT: i32x4.add +; CHECK-NEXT: # fallthrough-return + %zext1 = zext <8 x i16> %a to <8 x i32> + %zext2 = zext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %zext1, %zext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} + +; INFO: Negative test +define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: dot_wrong_shuffle: +; CHECK: .functype dot_wrong_shuffle (v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_low_i16x8_s +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32x4.extmul_high_i16x8_s +; CHECK-NEXT: i32x4.add +; CHECK-NEXT: # fallthrough-return + %sext1 = sext <8 x i16> %a to <8 x i32> + %sext2 = sext <8 x i16> %b to <8 x i32> + %mul = mul <8 x i32> %sext1, %sext2 + %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %res = add <4 x i32> %shuffle1, %shuffle2 + ret <4 x i32> %res +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll index e065de3..600241a 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll @@ -2,9 +2,278 @@ ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED ; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128, | FileCheck %s --check-prefix=STRICT +; RUN: llc < %s -disable-wasm-fallthrough-return-opt 
-wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=NOFP16 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NOSIMD target triple = "wasm32" +define half @fadd_fmul_contract_f16(half %a, half %b, half %c) { +; RELAXED-LABEL: fadd_fmul_contract_f16: +; RELAXED: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: call $push0=, __truncsfhf2, $0 +; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0 +; RELAXED-NEXT: call $push2=, __truncsfhf2, $1 +; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2 +; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3 +; RELAXED-NEXT: call $push5=, __truncsfhf2, $2 +; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5 +; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6 +; RELAXED-NEXT: return $pop7 +; +; STRICT-LABEL: fadd_fmul_contract_f16: +; STRICT: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: call $push0=, __truncsfhf2, $0 +; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0 +; STRICT-NEXT: call $push2=, __truncsfhf2, $1 +; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2 +; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3 +; STRICT-NEXT: call $push5=, __truncsfhf2, $2 +; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5 +; STRICT-NEXT: f32.add $push7=, $pop4, $pop6 +; STRICT-NEXT: return $pop7 +; +; NOFP16-LABEL: fadd_fmul_contract_f16: +; NOFP16: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $0 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: return $pop7 +; +; NOSIMD-LABEL: fadd_fmul_contract_f16: +; NOSIMD: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $0 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: return $pop7 + %mul = fmul contract half %b, %a + %add = fadd contract half %mul, %c + ret half %add +} + +define half @fmuladd_contract_f16(half %a, half %b, half %c) { +; RELAXED-LABEL: fmuladd_contract_f16: +; RELAXED: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: call $push0=, __truncsfhf2, $1 +; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0 +; RELAXED-NEXT: call $push2=, __truncsfhf2, $0 +; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2 +; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3 +; RELAXED-NEXT: call $push5=, __truncsfhf2, $2 +; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5 +; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6 +; RELAXED-NEXT: return $pop7 +; +; STRICT-LABEL: fmuladd_contract_f16: +; STRICT: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: call $push0=, __truncsfhf2, $1 +; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0 +; STRICT-NEXT: call $push2=, __truncsfhf2, $0 +; 
STRICT-NEXT: call $push3=, __extendhfsf2, $pop2 +; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3 +; STRICT-NEXT: call $push5=, __truncsfhf2, $2 +; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5 +; STRICT-NEXT: f32.add $push7=, $pop4, $pop6 +; STRICT-NEXT: return $pop7 +; +; NOFP16-LABEL: fmuladd_contract_f16: +; NOFP16: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $0 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: return $pop7 +; +; NOSIMD-LABEL: fmuladd_contract_f16: +; NOSIMD: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: return $pop7 + %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c) + ret half %fma +} + +define half @fmuladd_f16(half %a, half %b, half %c) { +; RELAXED-LABEL: fmuladd_f16: +; RELAXED: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: call $push0=, __truncsfhf2, $1 +; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0 +; RELAXED-NEXT: call $push2=, __truncsfhf2, $0 +; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2 +; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3 +; RELAXED-NEXT: call $push5=, __truncsfhf2, $2 +; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5 +; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6 +; RELAXED-NEXT: return $pop7 +; +; STRICT-LABEL: fmuladd_f16: +; STRICT: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: call $push0=, __truncsfhf2, $1 +; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0 +; STRICT-NEXT: call $push2=, __truncsfhf2, $0 +; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2 +; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3 +; STRICT-NEXT: call $push5=, __truncsfhf2, $2 +; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5 +; STRICT-NEXT: f32.add $push7=, $pop4, $pop6 +; STRICT-NEXT: return $pop7 +; +; NOFP16-LABEL: fmuladd_f16: +; NOFP16: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $0 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: return $pop7 +; +; NOSIMD-LABEL: fmuladd_f16: +; NOSIMD: .functype fmuladd_f16 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push6=, 
__extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: return $pop7 + %fma = call half @llvm.fmuladd(half %a, half %b, half %c) + ret half %fma +} + + +define float @fadd_fmul_contract_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fadd_fmul_contract_f32: +; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $1, $0 +; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fadd_fmul_contract_f32: +; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $1, $0 +; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_f32: +; NOFP16: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32.mul $push0=, $1, $0 +; NOFP16-NEXT: f32.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_f32: +; NOSIMD: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $1, $0 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %mul = fmul contract float %b, %a + %add = fadd contract float %mul, %c + ret float %add +} + +define float @fmuladd_contract_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fmuladd_contract_f32: +; RELAXED: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $0, $1 +; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_contract_f32: +; STRICT: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $0, $1 +; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_f32: +; NOFP16: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32.mul $push0=, $0, $1 +; NOFP16-NEXT: f32.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_f32: +; NOSIMD: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $0, $1 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c) + ret float %fma +} + +define float @fmuladd_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fmuladd_f32: +; RELAXED: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $0, $1 +; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_f32: +; STRICT: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $0, $1 +; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_f32: +; NOFP16: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32.mul $push0=, $0, $1 +; NOFP16-NEXT: f32.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_f32: +; NOSIMD: .functype fmuladd_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $0, $1 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call float @llvm.fmuladd(float %a, float %b, float %c) + ret float %fma 
+} + define double @fadd_fmul_contract_f64(double %a, double %b, double %c) { ; RELAXED-LABEL: fadd_fmul_contract_f64: ; RELAXED: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64) @@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) { ; STRICT-NEXT: f64.mul $push0=, $1, $0 ; STRICT-NEXT: f64.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_f64: +; NOFP16: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64.mul $push0=, $1, $0 +; NOFP16-NEXT: f64.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_f64: +; NOSIMD: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $1, $0 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 %mul = fmul contract double %b, %a %add = fadd contract double %mul, %c ret double %add } +define double @fmuladd_f64(double %a, double %b, double %c) { +; RELAXED-LABEL: fmuladd_f64: +; RELAXED: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64.mul $push0=, $0, $1 +; RELAXED-NEXT: f64.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_f64: +; STRICT: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64.mul $push0=, $0, $1 +; STRICT-NEXT: f64.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_f64: +; NOFP16: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64.mul $push0=, $0, $1 +; NOFP16-NEXT: f64.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_f64: +; NOSIMD: .functype fmuladd_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $0, $1 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call double @llvm.fmuladd(double %a, double %b, double %c) + ret double %fma +} + +define double @fmuladd_contract_f64(double %a, double %b, double %c) { +; RELAXED-LABEL: fmuladd_contract_f64: +; RELAXED: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64.mul $push0=, $0, $1 +; RELAXED-NEXT: f64.add $push1=, $pop0, $2 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fmuladd_contract_f64: +; STRICT: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64.mul $push0=, $0, $1 +; STRICT-NEXT: f64.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_f64: +; NOFP16: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64.mul $push0=, $0, $1 +; NOFP16-NEXT: f64.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_f64: +; NOSIMD: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $0, $1 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $2 +; NOSIMD-NEXT: return $pop1 + %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c) + ret double %fma +} + define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fadd_fmul_contract_4xf32: ; RELAXED: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $1, 
$0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fadd_fmul_contract_4xf32: @@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 ; STRICT-NEXT: f32x4.mul $push0=, $1, $0 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_4xf32: +; NOFP16: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $1, $0 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_4xf32: +; NOSIMD: .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $8, $4 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $7, $3 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $6, $2 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $5, $1 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %mul = fmul contract <4 x float> %b, %a %add = fadd contract <4 x float> %mul, %c ret <4 x float> %add } - define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; RELAXED-LABEL: fadd_fmul_contract_8xf16: ; RELAXED: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $1, $0 +; RELAXED-NEXT: f16x8.madd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fadd_fmul_contract_8xf16: ; STRICT: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; STRICT-NEXT: # %bb.0: -; STRICT-NEXT: f16x8.mul $push0=, $1, $0 -; STRICT-NEXT: f16x8.add $push1=, $pop0, $2 -; STRICT-NEXT: return $pop1 +; STRICT-NEXT: f16x8.madd $push0=, $1, $0, $2 +; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fadd_fmul_contract_8xf16: +; NOFP16: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $8 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $16 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $24 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOFP16-NEXT: i32.store16 14($0), $pop8 +; NOFP16-NEXT: call $push9=, __truncsfhf2, $7 +; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOFP16-NEXT: call $push11=, __truncsfhf2, $15 +; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOFP16-NEXT: call $push14=, __truncsfhf2, $23 +; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15 +; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOFP16-NEXT: i32.store16 12($0), $pop17 +; NOFP16-NEXT: call $push18=, __truncsfhf2, $6 +; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOFP16-NEXT: call $push20=, __truncsfhf2, $14 +; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21 +; 
NOFP16-NEXT: call $push23=, __truncsfhf2, $22 +; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24 +; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOFP16-NEXT: i32.store16 10($0), $pop26 +; NOFP16-NEXT: call $push27=, __truncsfhf2, $5 +; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOFP16-NEXT: call $push29=, __truncsfhf2, $13 +; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOFP16-NEXT: call $push32=, __truncsfhf2, $21 +; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33 +; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOFP16-NEXT: i32.store16 8($0), $pop35 +; NOFP16-NEXT: call $push36=, __truncsfhf2, $4 +; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOFP16-NEXT: call $push38=, __truncsfhf2, $12 +; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOFP16-NEXT: call $push41=, __truncsfhf2, $20 +; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42 +; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOFP16-NEXT: i32.store16 6($0), $pop44 +; NOFP16-NEXT: call $push45=, __truncsfhf2, $3 +; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOFP16-NEXT: call $push47=, __truncsfhf2, $11 +; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOFP16-NEXT: call $push50=, __truncsfhf2, $19 +; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51 +; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOFP16-NEXT: i32.store16 4($0), $pop53 +; NOFP16-NEXT: call $push54=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOFP16-NEXT: call $push56=, __truncsfhf2, $10 +; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOFP16-NEXT: call $push59=, __truncsfhf2, $18 +; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60 +; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOFP16-NEXT: i32.store16 2($0), $pop62 +; NOFP16-NEXT: call $push63=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOFP16-NEXT: call $push65=, __truncsfhf2, $9 +; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOFP16-NEXT: call $push68=, __truncsfhf2, $17 +; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69 +; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOFP16-NEXT: i32.store16 0($0), $pop71 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fadd_fmul_contract_8xf16: +; NOSIMD: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $8 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $16 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOSIMD-NEXT: i32.store16 14($0), $pop8 +; NOSIMD-NEXT: call $push9=, __truncsfhf2, $7 +; NOSIMD-NEXT: call 
$push10=, __extendhfsf2, $pop9 +; NOSIMD-NEXT: call $push11=, __truncsfhf2, $15 +; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23 +; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15 +; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOSIMD-NEXT: i32.store16 12($0), $pop17 +; NOSIMD-NEXT: call $push18=, __truncsfhf2, $6 +; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOSIMD-NEXT: call $push20=, __truncsfhf2, $14 +; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22 +; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24 +; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOSIMD-NEXT: i32.store16 10($0), $pop26 +; NOSIMD-NEXT: call $push27=, __truncsfhf2, $5 +; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOSIMD-NEXT: call $push29=, __truncsfhf2, $13 +; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21 +; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33 +; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOSIMD-NEXT: i32.store16 8($0), $pop35 +; NOSIMD-NEXT: call $push36=, __truncsfhf2, $4 +; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOSIMD-NEXT: call $push38=, __truncsfhf2, $12 +; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20 +; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42 +; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOSIMD-NEXT: i32.store16 6($0), $pop44 +; NOSIMD-NEXT: call $push45=, __truncsfhf2, $3 +; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOSIMD-NEXT: call $push47=, __truncsfhf2, $11 +; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19 +; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51 +; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOSIMD-NEXT: i32.store16 4($0), $pop53 +; NOSIMD-NEXT: call $push54=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOSIMD-NEXT: call $push56=, __truncsfhf2, $10 +; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18 +; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60 +; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOSIMD-NEXT: i32.store16 2($0), $pop62 +; NOSIMD-NEXT: call $push63=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOSIMD-NEXT: call $push65=, __truncsfhf2, $9 +; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17 +; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69 +; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOSIMD-NEXT: i32.store16 0($0), $pop71 +; NOSIMD-NEXT: return %mul = fmul contract <8 x half> %b, %a %add = fadd contract <8 x half> %mul, %c ret <8 x half> %add } - define <4 x 
float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fadd_fmul_4xf32: ; RELAXED: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128) @@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> ; STRICT-NEXT: f32x4.mul $push0=, $1, $0 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_4xf32: +; NOFP16: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $1, $0 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_4xf32: +; NOSIMD: .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $8, $4 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $7, $3 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $6, $2 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $5, $1 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %mul = fmul <4 x float> %b, %a %add = fadd contract <4 x float> %mul, %c ret <4 x float> %add } +define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fmuladd_contract_8xf16: +; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_contract_8xf16: +; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2 +; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fmuladd_contract_8xf16: +; NOFP16: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $16 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $8 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $24 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOFP16-NEXT: i32.store16 14($0), $pop8 +; NOFP16-NEXT: call $push9=, __truncsfhf2, $15 +; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOFP16-NEXT: call $push11=, __truncsfhf2, $7 +; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOFP16-NEXT: call $push14=, __truncsfhf2, $23 +; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15 +; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOFP16-NEXT: i32.store16 12($0), $pop17 +; NOFP16-NEXT: call $push18=, __truncsfhf2, $14 +; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOFP16-NEXT: call $push20=, __truncsfhf2, $6 +; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOFP16-NEXT: call $push23=, __truncsfhf2, $22 +; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOFP16-NEXT: f32.add $push25=, 
$pop22, $pop24 +; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOFP16-NEXT: i32.store16 10($0), $pop26 +; NOFP16-NEXT: call $push27=, __truncsfhf2, $13 +; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOFP16-NEXT: call $push29=, __truncsfhf2, $5 +; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOFP16-NEXT: call $push32=, __truncsfhf2, $21 +; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33 +; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOFP16-NEXT: i32.store16 8($0), $pop35 +; NOFP16-NEXT: call $push36=, __truncsfhf2, $12 +; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOFP16-NEXT: call $push38=, __truncsfhf2, $4 +; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOFP16-NEXT: call $push41=, __truncsfhf2, $20 +; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42 +; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOFP16-NEXT: i32.store16 6($0), $pop44 +; NOFP16-NEXT: call $push45=, __truncsfhf2, $11 +; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOFP16-NEXT: call $push47=, __truncsfhf2, $3 +; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOFP16-NEXT: call $push50=, __truncsfhf2, $19 +; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51 +; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOFP16-NEXT: i32.store16 4($0), $pop53 +; NOFP16-NEXT: call $push54=, __truncsfhf2, $10 +; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOFP16-NEXT: call $push56=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOFP16-NEXT: call $push59=, __truncsfhf2, $18 +; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60 +; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOFP16-NEXT: i32.store16 2($0), $pop62 +; NOFP16-NEXT: call $push63=, __truncsfhf2, $9 +; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOFP16-NEXT: call $push65=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOFP16-NEXT: call $push68=, __truncsfhf2, $17 +; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69 +; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOFP16-NEXT: i32.store16 0($0), $pop71 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fmuladd_contract_8xf16: +; NOSIMD: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOSIMD-NEXT: i32.store16 14($0), $pop8 +; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15 +; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7 +; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11 +; 
NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23 +; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15 +; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOSIMD-NEXT: i32.store16 12($0), $pop17 +; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14 +; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6 +; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22 +; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24 +; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOSIMD-NEXT: i32.store16 10($0), $pop26 +; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13 +; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5 +; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21 +; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33 +; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOSIMD-NEXT: i32.store16 8($0), $pop35 +; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12 +; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4 +; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20 +; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42 +; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOSIMD-NEXT: i32.store16 6($0), $pop44 +; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11 +; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3 +; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19 +; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51 +; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOSIMD-NEXT: i32.store16 4($0), $pop53 +; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10 +; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18 +; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60 +; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOSIMD-NEXT: i32.store16 2($0), $pop62 +; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9 +; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17 +; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69 +; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOSIMD-NEXT: i32.store16 0($0), $pop71 +; NOSIMD-NEXT: return + %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c) + ret <8 x half> %fma +} + +define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fmuladd_8xf16: +; RELAXED: .functype 
fmuladd_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_8xf16: +; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2 +; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fmuladd_8xf16: +; NOFP16: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, __truncsfhf2, $16 +; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOFP16-NEXT: call $push2=, __truncsfhf2, $8 +; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOFP16-NEXT: call $push5=, __truncsfhf2, $24 +; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6 +; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOFP16-NEXT: i32.store16 14($0), $pop8 +; NOFP16-NEXT: call $push9=, __truncsfhf2, $15 +; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOFP16-NEXT: call $push11=, __truncsfhf2, $7 +; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOFP16-NEXT: call $push14=, __truncsfhf2, $23 +; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15 +; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOFP16-NEXT: i32.store16 12($0), $pop17 +; NOFP16-NEXT: call $push18=, __truncsfhf2, $14 +; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOFP16-NEXT: call $push20=, __truncsfhf2, $6 +; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOFP16-NEXT: call $push23=, __truncsfhf2, $22 +; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24 +; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOFP16-NEXT: i32.store16 10($0), $pop26 +; NOFP16-NEXT: call $push27=, __truncsfhf2, $13 +; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOFP16-NEXT: call $push29=, __truncsfhf2, $5 +; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOFP16-NEXT: call $push32=, __truncsfhf2, $21 +; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33 +; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOFP16-NEXT: i32.store16 8($0), $pop35 +; NOFP16-NEXT: call $push36=, __truncsfhf2, $12 +; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOFP16-NEXT: call $push38=, __truncsfhf2, $4 +; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOFP16-NEXT: call $push41=, __truncsfhf2, $20 +; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41 +; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42 +; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOFP16-NEXT: i32.store16 6($0), $pop44 +; NOFP16-NEXT: call $push45=, __truncsfhf2, $11 +; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOFP16-NEXT: call $push47=, __truncsfhf2, $3 +; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOFP16-NEXT: call $push50=, __truncsfhf2, $19 +; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51 +; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOFP16-NEXT: i32.store16 4($0), $pop53 +; NOFP16-NEXT: 
call $push54=, __truncsfhf2, $10 +; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOFP16-NEXT: call $push56=, __truncsfhf2, $2 +; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOFP16-NEXT: call $push59=, __truncsfhf2, $18 +; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60 +; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOFP16-NEXT: i32.store16 2($0), $pop62 +; NOFP16-NEXT: call $push63=, __truncsfhf2, $9 +; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOFP16-NEXT: call $push65=, __truncsfhf2, $1 +; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOFP16-NEXT: call $push68=, __truncsfhf2, $17 +; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69 +; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOFP16-NEXT: i32.store16 0($0), $pop71 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fmuladd_8xf16: +; NOSIMD: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16 +; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0 +; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8 +; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2 +; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3 +; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24 +; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5 +; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6 +; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7 +; NOSIMD-NEXT: i32.store16 14($0), $pop8 +; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15 +; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9 +; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7 +; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11 +; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12 +; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23 +; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14 +; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15 +; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16 +; NOSIMD-NEXT: i32.store16 12($0), $pop17 +; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14 +; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18 +; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6 +; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20 +; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21 +; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22 +; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23 +; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24 +; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25 +; NOSIMD-NEXT: i32.store16 10($0), $pop26 +; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13 +; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27 +; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5 +; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29 +; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30 +; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21 +; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32 +; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33 +; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34 +; NOSIMD-NEXT: i32.store16 8($0), $pop35 +; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12 +; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36 +; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4 +; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38 +; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39 +; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20 +; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41 +; 
NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42 +; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43 +; NOSIMD-NEXT: i32.store16 6($0), $pop44 +; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11 +; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45 +; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3 +; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47 +; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48 +; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19 +; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50 +; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51 +; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52 +; NOSIMD-NEXT: i32.store16 4($0), $pop53 +; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10 +; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54 +; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2 +; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56 +; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57 +; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18 +; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59 +; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60 +; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61 +; NOSIMD-NEXT: i32.store16 2($0), $pop62 +; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9 +; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63 +; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1 +; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65 +; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66 +; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17 +; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68 +; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69 +; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70 +; NOSIMD-NEXT: i32.store16 0($0), $pop71 +; NOSIMD-NEXT: return + %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c) + ret <8 x half> %fma +} + define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fmuladd_contract_4xf32: ; RELAXED: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fmuladd_contract_4xf32: @@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x ; STRICT-NEXT: f32x4.mul $push0=, $0, $1 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_4xf32: +; NOFP16: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $0, $1 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_4xf32: +; NOSIMD: .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $4, $8 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $3, $7 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $2, $6 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $1, $5 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %fma } -; TODO: This should also have relaxed_madd in RELAXED case define <4 x float> 
@fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fmuladd_4xf32: ; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.mul $push0=, $0, $1 -; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2 -; RELAXED-NEXT: return $pop1 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fmuladd_4xf32: ; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) @@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c ; STRICT-NEXT: f32x4.mul $push0=, $0, $1 ; STRICT-NEXT: f32x4.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_4xf32: +; NOFP16: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $0, $1 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_4xf32: +; NOSIMD: .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $4, $8 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $3, $7 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $2, $6 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $1, $5 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop7 +; NOSIMD-NEXT: return %fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %fma } +define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; RELAXED-LABEL: fmuladd_8xf32: +; RELAXED: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.mul $push0=, $2, $4 +; RELAXED-NEXT: f32x4.add $push1=, $pop0, $6 +; RELAXED-NEXT: v128.store 16($0), $pop1 +; RELAXED-NEXT: f32x4.mul $push2=, $1, $3 +; RELAXED-NEXT: f32x4.add $push3=, $pop2, $5 +; RELAXED-NEXT: v128.store 0($0), $pop3 +; RELAXED-NEXT: return +; +; STRICT-LABEL: fmuladd_8xf32: +; STRICT: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $2, $4 +; STRICT-NEXT: f32x4.add $push1=, $pop0, $6 +; STRICT-NEXT: v128.store 16($0), $pop1 +; STRICT-NEXT: f32x4.mul $push2=, $1, $3 +; STRICT-NEXT: f32x4.add $push3=, $pop2, $5 +; STRICT-NEXT: v128.store 0($0), $pop3 +; STRICT-NEXT: return +; +; NOFP16-LABEL: fmuladd_8xf32: +; NOFP16: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $2, $4 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6 +; NOFP16-NEXT: v128.store 16($0), $pop1 +; NOFP16-NEXT: f32x4.mul $push2=, $1, $3 +; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5 +; NOFP16-NEXT: v128.store 0($0), $pop3 +; NOFP16-NEXT: return +; +; NOSIMD-LABEL: fmuladd_8xf32: +; NOSIMD: .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $8, $16 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $24 +; NOSIMD-NEXT: f32.store 28($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $7, $15 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $23 +; NOSIMD-NEXT: f32.store 
24($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $6, $14 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $22 +; NOSIMD-NEXT: f32.store 20($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $5, $13 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $21 +; NOSIMD-NEXT: f32.store 16($0), $pop7 +; NOSIMD-NEXT: f32.mul $push8=, $4, $12 +; NOSIMD-NEXT: f32.add $push9=, $pop8, $20 +; NOSIMD-NEXT: f32.store 12($0), $pop9 +; NOSIMD-NEXT: f32.mul $push10=, $3, $11 +; NOSIMD-NEXT: f32.add $push11=, $pop10, $19 +; NOSIMD-NEXT: f32.store 8($0), $pop11 +; NOSIMD-NEXT: f32.mul $push12=, $2, $10 +; NOSIMD-NEXT: f32.add $push13=, $pop12, $18 +; NOSIMD-NEXT: f32.store 4($0), $pop13 +; NOSIMD-NEXT: f32.mul $push14=, $1, $9 +; NOSIMD-NEXT: f32.add $push15=, $pop14, $17 +; NOSIMD-NEXT: f32.store 0($0), $pop15 +; NOSIMD-NEXT: return + %fma = call <8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c) + ret <8 x float> %fma +} + +define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fmuladd_contract_2xf64: +; RELAXED: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_contract_2xf64: +; STRICT: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $0, $1 +; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_contract_2xf64: +; NOFP16: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $0, $1 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_contract_2xf64: +; NOSIMD: .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $2, $4 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $1, $3 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return + %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c) + ret <2 x double> %fma +} + +define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fmuladd_2xf64: +; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_2xf64: +; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $0, $1 +; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 +; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fmuladd_2xf64: +; NOFP16: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $0, $1 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fmuladd_2xf64: +; NOSIMD: .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $2, $4 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $1, $3 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return + %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x 
double> %b, <2 x double> %c) + ret <2 x double> %fma +} + define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; RELAXED-LABEL: fma_4xf32: ; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128) @@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15 ; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18 ; STRICT-NEXT: return $pop19 +; +; NOFP16-LABEL: fma_4xf32: +; NOFP16: .functype fma_4xf32 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.extract_lane $push2=, $0, 0 +; NOFP16-NEXT: f32x4.extract_lane $push1=, $1, 0 +; NOFP16-NEXT: f32x4.extract_lane $push0=, $2, 0 +; NOFP16-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0 +; NOFP16-NEXT: f32x4.splat $push4=, $pop3 +; NOFP16-NEXT: f32x4.extract_lane $push7=, $0, 1 +; NOFP16-NEXT: f32x4.extract_lane $push6=, $1, 1 +; NOFP16-NEXT: f32x4.extract_lane $push5=, $2, 1 +; NOFP16-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5 +; NOFP16-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8 +; NOFP16-NEXT: f32x4.extract_lane $push12=, $0, 2 +; NOFP16-NEXT: f32x4.extract_lane $push11=, $1, 2 +; NOFP16-NEXT: f32x4.extract_lane $push10=, $2, 2 +; NOFP16-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10 +; NOFP16-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13 +; NOFP16-NEXT: f32x4.extract_lane $push17=, $0, 3 +; NOFP16-NEXT: f32x4.extract_lane $push16=, $1, 3 +; NOFP16-NEXT: f32x4.extract_lane $push15=, $2, 3 +; NOFP16-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15 +; NOFP16-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18 +; NOFP16-NEXT: return $pop19 +; +; NOSIMD-LABEL: fma_4xf32: +; NOSIMD: .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, fmaf, $4, $8, $12 +; NOSIMD-NEXT: f32.store 12($0), $pop0 +; NOSIMD-NEXT: call $push1=, fmaf, $3, $7, $11 +; NOSIMD-NEXT: f32.store 8($0), $pop1 +; NOSIMD-NEXT: call $push2=, fmaf, $2, $6, $10 +; NOSIMD-NEXT: f32.store 4($0), $pop2 +; NOSIMD-NEXT: call $push3=, fmaf, $1, $5, $9 +; NOSIMD-NEXT: f32.store 0($0), $pop3 +; NOSIMD-NEXT: return %fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c) ret <4 x float> %fma } @@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 ; RELAXED-LABEL: fadd_fmul_contract_8xf32: ; RELAXED: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2 +; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $4, $2, $6 ; RELAXED-NEXT: v128.store 16($0), $pop0 -; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1 +; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $3, $1, $5 ; RELAXED-NEXT: v128.store 0($0), $pop1 ; RELAXED-NEXT: return ; @@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 ; STRICT-NEXT: f32x4.add $push3=, $pop2, $5 ; STRICT-NEXT: v128.store 0($0), $pop3 ; STRICT-NEXT: return +; +; NOFP16-LABEL: fadd_fmul_contract_8xf32: +; NOFP16: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f32x4.mul $push0=, $4, $2 +; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6 +; NOFP16-NEXT: v128.store 16($0), $pop1 +; NOFP16-NEXT: f32x4.mul $push2=, $3, $1 +; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5 +; NOFP16-NEXT: v128.store 0($0), $pop3 +; 
NOFP16-NEXT: return +; +; NOSIMD-LABEL: fadd_fmul_contract_8xf32: +; NOSIMD: .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f32.mul $push0=, $16, $8 +; NOSIMD-NEXT: f32.add $push1=, $pop0, $24 +; NOSIMD-NEXT: f32.store 28($0), $pop1 +; NOSIMD-NEXT: f32.mul $push2=, $15, $7 +; NOSIMD-NEXT: f32.add $push3=, $pop2, $23 +; NOSIMD-NEXT: f32.store 24($0), $pop3 +; NOSIMD-NEXT: f32.mul $push4=, $14, $6 +; NOSIMD-NEXT: f32.add $push5=, $pop4, $22 +; NOSIMD-NEXT: f32.store 20($0), $pop5 +; NOSIMD-NEXT: f32.mul $push6=, $13, $5 +; NOSIMD-NEXT: f32.add $push7=, $pop6, $21 +; NOSIMD-NEXT: f32.store 16($0), $pop7 +; NOSIMD-NEXT: f32.mul $push8=, $12, $4 +; NOSIMD-NEXT: f32.add $push9=, $pop8, $20 +; NOSIMD-NEXT: f32.store 12($0), $pop9 +; NOSIMD-NEXT: f32.mul $push10=, $11, $3 +; NOSIMD-NEXT: f32.add $push11=, $pop10, $19 +; NOSIMD-NEXT: f32.store 8($0), $pop11 +; NOSIMD-NEXT: f32.mul $push12=, $10, $2 +; NOSIMD-NEXT: f32.add $push13=, $pop12, $18 +; NOSIMD-NEXT: f32.store 4($0), $pop13 +; NOSIMD-NEXT: f32.mul $push14=, $9, $1 +; NOSIMD-NEXT: f32.add $push15=, $pop14, $17 +; NOSIMD-NEXT: f32.store 0($0), $pop15 +; NOSIMD-NEXT: return %mul = fmul contract <8 x float> %b, %a %add = fadd contract <8 x float> %mul, %c ret <8 x float> %add } - define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; RELAXED-LABEL: fadd_fmul_contract_2xf64: ; RELAXED: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0 +; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fadd_fmul_contract_2xf64: @@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, ; STRICT-NEXT: f64x2.mul $push0=, $1, $0 ; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 +; +; NOFP16-LABEL: fadd_fmul_contract_2xf64: +; NOFP16: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $1, $0 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_contract_2xf64: +; NOSIMD: .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $4, $2 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $3, $1 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return %mul = fmul contract <2 x double> %b, %a %add = fadd contract <2 x double> %mul, %c ret <2 x double> %add } -define float @fadd_fmul_contract_f32(float %a, float %b, float %c) { -; RELAXED-LABEL: fadd_fmul_contract_f32: -; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32) +define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fadd_fmul_2xf64: +; RELAXED: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32.mul $push0=, $1, $0 -; RELAXED-NEXT: f32.add $push1=, $pop0, $2 +; RELAXED-NEXT: f64x2.mul $push0=, $1, $0 +; RELAXED-NEXT: f64x2.add $push1=, $pop0, $2 ; RELAXED-NEXT: return $pop1 ; -; STRICT-LABEL: fadd_fmul_contract_f32: -; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> 
(f32) +; STRICT-LABEL: fadd_fmul_2xf64: +; STRICT: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128) ; STRICT-NEXT: # %bb.0: -; STRICT-NEXT: f32.mul $push0=, $1, $0 -; STRICT-NEXT: f32.add $push1=, $pop0, $2 +; STRICT-NEXT: f64x2.mul $push0=, $1, $0 +; STRICT-NEXT: f64x2.add $push1=, $pop0, $2 ; STRICT-NEXT: return $pop1 - %mul = fmul contract float %b, %a - %add = fadd contract float %mul, %c - ret float %add +; +; NOFP16-LABEL: fadd_fmul_2xf64: +; NOFP16: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: f64x2.mul $push0=, $1, $0 +; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2 +; NOFP16-NEXT: return $pop1 +; +; NOSIMD-LABEL: fadd_fmul_2xf64: +; NOSIMD: .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> () +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: f64.mul $push0=, $4, $2 +; NOSIMD-NEXT: f64.add $push1=, $pop0, $6 +; NOSIMD-NEXT: f64.store 8($0), $pop1 +; NOSIMD-NEXT: f64.mul $push2=, $3, $1 +; NOSIMD-NEXT: f64.add $push3=, $pop2, $5 +; NOSIMD-NEXT: f64.store 0($0), $pop3 +; NOSIMD-NEXT: return + %mul = fmul <2 x double> %b, %a + %add = fadd <2 x double> %mul, %c + ret <2 x double> %add } define float @fma_f32(float %a, float %b, float %c) { @@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) { ; STRICT-NEXT: # %bb.0: ; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2 ; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fma_f32: +; NOFP16: .functype fma_f32 (f32, f32, f32) -> (f32) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, fmaf, $0, $1, $2 +; NOFP16-NEXT: return $pop0 +; +; NOSIMD-LABEL: fma_f32: +; NOSIMD: .functype fma_f32 (f32, f32, f32) -> (f32) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, fmaf, $0, $1, $2 +; NOSIMD-NEXT: return $pop0 %fma = call float @llvm.fma(float %a, float %b, float %c) ret float %fma } @@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) { ; STRICT-NEXT: # %bb.0: ; STRICT-NEXT: call $push0=, fma, $0, $1, $2 ; STRICT-NEXT: return $pop0 +; +; NOFP16-LABEL: fma_f64: +; NOFP16: .functype fma_f64 (f64, f64, f64) -> (f64) +; NOFP16-NEXT: # %bb.0: +; NOFP16-NEXT: call $push0=, fma, $0, $1, $2 +; NOFP16-NEXT: return $pop0 +; +; NOSIMD-LABEL: fma_f64: +; NOSIMD: .functype fma_f64 (f64, f64, f64) -> (f64) +; NOSIMD-NEXT: # %bb.0: +; NOSIMD-NEXT: call $push0=, fma, $0, $1, $2 +; NOSIMD-NEXT: return $pop0 %fma = call double @llvm.fma(double %a, double %b, double %c) ret double %fma } diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll index 6e2d860..b90c1da 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll @@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 ; RELAXED-LABEL: fsub_fmul_contract_4xf32: ; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fsub_fmul_contract_4xf32: @@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h ; RELAXED-LABEL: fsub_fmul_contract_8xf16: ; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: f16x8.nmadd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: 
fsub_fmul_contract_8xf16: ; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128) ; STRICT-NEXT: # %bb.0: -; STRICT-NEXT: f16x8.mul $push0=, $1, $0 -; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0 -; STRICT-NEXT: return $pop1 +; STRICT-NEXT: f16x8.nmadd $push0=, $1, $0, $2 +; STRICT-NEXT: return $pop0 %mul = fmul contract <8 x half> %b, %a %sub = fsub contract <8 x half> %c, %mul ret <8 x half> %sub @@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 ; RELAXED-LABEL: fsub_fmul_contract_8xf32: ; RELAXED: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $4, $2, $6 ; RELAXED-NEXT: v128.store 16($0), $pop0 -; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $3, $1, $5 ; RELAXED-NEXT: v128.store 0($0), $pop1 ; RELAXED-NEXT: return ; @@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, ; RELAXED-LABEL: fsub_fmul_contract_2xf64: ; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128) ; RELAXED-NEXT: # %bb.0: -; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $1, $0, $2 ; RELAXED-NEXT: return $pop0 ; ; STRICT-LABEL: fsub_fmul_contract_2xf64: @@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) { ret float %sub } +define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fmuladd_8xf16: +; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.nmadd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_8xf16: +; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.nmadd $push0=, $0, $1, $2 +; STRICT-NEXT: return $pop0 + %fneg = fneg <8 x half> %a + %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c) + ret <8 x half> %fma +} + +define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; RELAXED-LABEL: fmuladd_4xf32: +; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_4xf32: +; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $0, $1 +; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %fneg = fneg <4 x float> %a + %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c) + ret <4 x float> %fma +} + +define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fmuladd_2xf64: +; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $0, $1, $2 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fmuladd_2xf64: +; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $0, $1 +; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %fneg = fneg <2 x double> %a + %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c) + ret <2 x double> %fma +} 
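The WebAssembly SIMD hunks above (ending with simd-relaxed-fnma.ll) document a single lowering change: with relaxed SIMD enabled, a plain @llvm.fmuladd call, and an fneg feeding @llvm.fmuladd, now select f32x4.relaxed_madd / f64x2.relaxed_madd and the corresponding relaxed_nmadd forms (f16x8.madd / f16x8.nmadd for half vectors), with operands ordered multiplicand, multiplier, addend. Previously only the `contract` variants did, with the addend listed first; the dropped TODO comment marked exactly this gap. Below is a hypothetical standalone reproducer, not taken from the test file; the llc flags and file name are assumptions, since the tests' actual RUN lines are not shown in these hunks.

; Hypothetical reproducer (not part of the diff); flags and file name are assumed, not copied.
; llc -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd reproducer.ll -o -
define <4 x float> @madd_example(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
  ; Expected per the RELAXED checks for fmuladd_4xf32 above:
  ;   f32x4.relaxed_madd $push0=, $0, $1, $2
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  ret <4 x float> %fma
}
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)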
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 0de308a..5152c005 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -728,45 +728,70 @@ define void @avg_v32i8_2(ptr %a, ptr %b) nounwind { define void @avg_v64i8_2(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v64i8_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rsi), %xmm0 -; SSE2-NEXT: movaps 16(%rsi), %xmm1 -; SSE2-NEXT: movaps 32(%rsi), %xmm2 -; SSE2-NEXT: movaps 48(%rsi), %xmm3 -; SSE2-NEXT: movups %xmm3, (%rax) -; SSE2-NEXT: movups %xmm2, (%rax) -; SSE2-NEXT: movups %xmm1, (%rax) -; SSE2-NEXT: movups %xmm0, (%rax) +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 +; SSE2-NEXT: pavgb 48(%rsi), %xmm3 +; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb 48(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) -; AVX2-NEXT: vmovups %ymm0, (%rax) +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: avg_v64i8_2: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rsi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: avg_v64i8_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v64i8_2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <64 x i8>, ptr %a %2 = load <64 x i8>, ptr %b %3 = zext <64 x i8> %1 to <64 x i32> %4 = zext <64 x i8> %2 to <64 x i32> - %5 = add nuw nsw <64 x i32> %4, %4 + %5 = add nuw nsw <64 x i32> %3, %4 %6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 
1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %8 = trunc <64 x i32> %7 to <64 x i8> @@ -774,7 +799,6 @@ define void @avg_v64i8_2(ptr %a, ptr %b) nounwind { ret void } - define void @avg_v4i16_2(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v4i16_2: ; SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll index a0c243b..f3950b7 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll @@ -1,16 +1,15 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -;; A minimal test case. llc will crash if global variables already has a section -;; prefix. Subsequent PRs will expand on this test case to test the hotness -;; reconciliation implementation. - -; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +;; A minimal test case. Subsequent PRs will expand on this test case +;; (e.g., with more functions, variables and profiles) and test the hotness +;; reconciliation implementation. +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ ; RUN: -partition-static-data-sections=true \ ; RUN: -data-sections=true -unique-section-names=false \ -; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=IR -; ERR: Global variable hot_bss already has a section prefix hot +; IR: .section .bss.hot.,"aw" @hot_bss = internal global i32 0, !section_prefix !17 diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll index ce06d17..604b4fd 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition.ll @@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu" ; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8 ; AGG-NEXT: .section .data.unlikely.,"aw",@progbits +;; The `.section` directive is omitted for .data with -unique-section-names=false. +; See MCSectionELF::shouldOmitSectionDirective for the implementation details. + ; For @data_with_unknown_hotness ; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness ; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits ; UNIQ: .section .data,"aw",@progbits,unique,9 -; The `.section` directive is omitted for .data with -unique-section-names=false. -; See MCSectionELF::shouldOmitSectionDirective for the implementation details. + ; AGG: .data ; COMMON: .Ldata_with_unknown_hotness: -; For @hot_data_custom_bar_section -; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix.
+; For variables that are not eligible for section prefix annotation ; COMMON: .type hot_data_custom_bar_section,@object ; SYM-NEXT: .section bar,"aw",@progbits ; SYM: hot_data_custom_bar_section ; UNIQ: .section bar,"aw",@progbits ; AGG: .section bar,"aw",@progbits +; SYM: .section .data.llvm.fake_var,"aw" +; UNIQ: .section .data,"aw" +; AGG: .data + +;; No section for linker declaration +; COMMON-NOT: qux + @.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1 @.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1 @hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3] @@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu" @data3 = internal global i32 3 @data_with_unknown_hotness = private global i32 5 @hot_data_custom_bar_section = internal global i32 101 #0 +@llvm.fake_var = internal global i32 123 +@qux = external global i64 define void @cold_func(i32 %0) !prof !15 { %2 = load i32, ptr @cold_bss diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll index ea22b08..954ea8f 100644 --- a/llvm/test/CodeGen/X86/relptr-rodata.ll +++ b/llvm/test/CodeGen/X86/relptr-rodata.ll @@ -10,16 +10,31 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: .long hidden-rodata @rodata = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @rodata to i64)) to i32) +; CHECK: .section .rodata.rodata_ptrtoaddr +; CHECK: rodata_ptrtoaddr: +; CHECK: .long hidden-rodata_ptrtoaddr +@rodata_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @rodata_ptrtoaddr to i64)) to i32) + ; CHECK: .section .data.rel.ro.relro1 ; CHECK: relro1: ; CHECK: .long default-relro1 @relro1 = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @default to i64), i64 ptrtoint (ptr @relro1 to i64)) to i32) +; CHECK: .section .data.rel.ro.relro1_ptrtoaddr +; CHECK: relro1_ptrtoaddr: +; CHECK: .long default-relro1_ptrtoaddr +@relro1_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @default to i64), i64 ptrtoaddr (ptr @relro1_ptrtoaddr to i64)) to i32) + ; CHECK: .section .data.rel.ro.relro2 ; CHECK: relro2: ; CHECK: .long hidden-relro2 @relro2 = constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @relro2 to i64)) to i32) +; CHECK: .section .data.rel.ro.relro2_ptrtoaddr +; CHECK: relro2_ptrtoaddr: +; CHECK: .long hidden-relro2_ptrtoaddr +@relro2_ptrtoaddr = constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @relro2_ptrtoaddr to i64)) to i32) + ; CHECK: .section .rodata.obj ; CHECK-NEXT: .globl obj ; CHECK: obj: diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll index 5aa266d..69abf6e 100644 --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -1447,3 +1447,158 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) { %r = icmp eq i512 %a, %b ret i1 %r } + +; Tests for any/allbits from memory. 
+ +define i1 @anybits_i128_load_arg(ptr %w) { +; ANY-LABEL: anybits_i128_load_arg: +; ANY: # %bb.0: +; ANY-NEXT: movq (%rdi), %rax +; ANY-NEXT: orq 8(%rdi), %rax +; ANY-NEXT: setne %al +; ANY-NEXT: retq + %ld = load i128, ptr %w + %cmp = icmp ne i128 %ld, 0 + ret i1 %cmp +} + +define i1 @allbits_i128_load_arg(ptr %w) { +; SSE2-LABEL: allbits_i128_load_arg: +; SSE2: # %bb.0: +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pcmpeqb (%rdi), %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE2-NEXT: sete %al +; SSE2-NEXT: retq +; +; SSE41-LABEL: allbits_i128_load_arg: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: ptest %xmm1, %xmm0 +; SSE41-NEXT: setb %al +; SSE41-NEXT: retq +; +; AVXANY-LABEL: allbits_i128_load_arg: +; AVXANY: # %bb.0: +; AVXANY-NEXT: vmovdqa (%rdi), %xmm0 +; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVXANY-NEXT: vptest %xmm1, %xmm0 +; AVXANY-NEXT: setb %al +; AVXANY-NEXT: retq + %ld = load i128, ptr %w + %cmp = icmp eq i128 %ld, -1 + ret i1 %cmp +} + +define i1 @anybits_i256_load_arg(ptr %w) { +; ANY-LABEL: anybits_i256_load_arg: +; ANY: # %bb.0: +; ANY-NEXT: movq (%rdi), %rax +; ANY-NEXT: movq 8(%rdi), %rcx +; ANY-NEXT: orq 24(%rdi), %rcx +; ANY-NEXT: orq 16(%rdi), %rax +; ANY-NEXT: orq %rcx, %rax +; ANY-NEXT: setne %al +; ANY-NEXT: retq + %ld = load i256, ptr %w + %cmp = icmp ne i256 %ld, 0 + ret i1 %cmp +} + +define i1 @allbits_i256_load_arg(ptr %w) { +; SSE-LABEL: allbits_i256_load_arg: +; SSE: # %bb.0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq 8(%rdi), %rcx +; SSE-NEXT: andq 24(%rdi), %rcx +; SSE-NEXT: andq 16(%rdi), %rax +; SSE-NEXT: andq %rcx, %rax +; SSE-NEXT: cmpq $-1, %rax +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX1-LABEL: allbits_i256_load_arg: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %ymm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vptest %ymm1, %ymm0 +; AVX1-NEXT: setb %al +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: allbits_i256_load_arg: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: setb %al +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: allbits_i256_load_arg: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: setb %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ld = load i256, ptr %w + %cmp = icmp eq i256 %ld, -1 + ret i1 %cmp +} + +define i1 @anybits_i512_load_arg(ptr %w) { +; ANY-LABEL: anybits_i512_load_arg: +; ANY: # %bb.0: +; ANY-NEXT: movq 16(%rdi), %rax +; ANY-NEXT: movq (%rdi), %rcx +; ANY-NEXT: movq 8(%rdi), %rdx +; ANY-NEXT: movq 24(%rdi), %rsi +; ANY-NEXT: orq 56(%rdi), %rsi +; ANY-NEXT: orq 40(%rdi), %rdx +; ANY-NEXT: orq %rsi, %rdx +; ANY-NEXT: orq 48(%rdi), %rax +; ANY-NEXT: orq 32(%rdi), %rcx +; ANY-NEXT: orq %rax, %rcx +; ANY-NEXT: orq %rdx, %rcx +; ANY-NEXT: setne %al +; ANY-NEXT: retq + %ld = load i512, ptr %w + %cmp = icmp ne i512 %ld, 0 + ret i1 %cmp +} + +define i1 @allbits_i512_load_arg(ptr %w) { +; NO512-LABEL: allbits_i512_load_arg: +; NO512: # %bb.0: +; NO512-NEXT: movq 16(%rdi), %rax +; NO512-NEXT: movq (%rdi), %rcx +; NO512-NEXT: movq 8(%rdi), %rdx +; NO512-NEXT: movq 24(%rdi), %rsi +; NO512-NEXT: andq 56(%rdi), %rsi +; NO512-NEXT: andq 40(%rdi), %rdx +; NO512-NEXT: andq %rsi, %rdx +; NO512-NEXT: andq 48(%rdi), 
%rax +; NO512-NEXT: andq 32(%rdi), %rcx +; NO512-NEXT: andq %rax, %rcx +; NO512-NEXT: andq %rdx, %rcx +; NO512-NEXT: cmpq $-1, %rcx +; NO512-NEXT: sete %al +; NO512-NEXT: retq +; +; AVX512-LABEL: allbits_i512_load_arg: +; AVX512: # %bb.0: +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1 +; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0 +; AVX512-NEXT: kortestw %k0, %k0 +; AVX512-NEXT: sete %al +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %ld = load i512, ptr %w + %cmp = icmp eq i512 %ld, -1 + ret i1 %cmp +} |
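The new setcc-wide-types.ll tests above cover wide equality compares fed directly from memory: the anybits_* variants OR the loaded quadwords together and test for non-zero, while the allbits_* variants compare the loaded value against -1. On AVX-capable targets the all-bits form is selected as a vector load, a vpcmpeqd that materializes an all-ones register, and a vptest followed by setb; vptest sets CF when (~value & mask) is zero, so with an all-ones mask CF is set exactly when every bit of the load is set. The scalar fallbacks AND the quadwords and compare the result with -1. A minimal sketch of the pattern, mirroring the added i128 test rather than extending it:

; Illustrative only; equivalent to the allbits_i128_load_arg test above.
define i1 @allbits_i128(ptr %p) {
  %v = load i128, ptr %p
  ; Expected per the AVXANY checks above: vmovdqa load, vpcmpeqd all-ones,
  ; vptest, then setb (CF = 1 exactly when the loaded value is all ones).
  %c = icmp eq i128 %v, -1
  ret i1 %c
}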